[clang] db158c7 - [AArch64] Update generic sched model to A510

Harvin Iriawan via cfe-commits <cfe-commits at lists.llvm.org>
Mon Aug 21 04:25:47 PDT 2023


Author: Harvin Iriawan
Date: 2023-08-21T12:25:15+01:00
New Revision: db158c7c830807caeeb0691739c41f1d522029e9

URL: https://github.com/llvm/llvm-project/commit/db158c7c830807caeeb0691739c41f1d522029e9
DIFF: https://github.com/llvm/llvm-project/commit/db158c7c830807caeeb0691739c41f1d522029e9.diff

LOG: [AArch64] Update generic sched model to A510

  Refresh of the generic scheduling model to use the A510 instead of the A55.
  The main benefits are to little cores, and the update also introduces SVE scheduling information.
  The changes were tested on various OoO cores; no performance degradation was seen.

  Differential Revision: https://reviews.llvm.org/D156799
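
  As a rough illustration (not part of the commit), many of the llc tests updated
  below have this shape: no -mcpu is given, so codegen falls back to the "generic"
  processor, whose scheduling model this patch switches from CortexA55Model to
  CortexA510Model.

    ; Hypothetical reduced test, assuming the usual lit + FileCheck setup.
    ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
    define i32 @madd(i32 %a, i32 %b, i32 %c) {
    ; CHECK-LABEL: madd:
    ; The instruction order behind each CHECK line depends on the generic
    ; scheduling model, which is why so many checks are regenerated here.
      %m = mul i32 %a, %b
      %r = add i32 %m, %c
      ret i32 %r
    }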

Added: 
    

Modified: 
    clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
    llvm/lib/Target/AArch64/AArch64.td
    llvm/test/Analysis/CostModel/AArch64/vector-select.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2_lse128.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2_lse128.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8_1a.ll
    llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll
    llvm/test/CodeGen/AArch64/GlobalISel/aapcs_vararg_frame.ll
    llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
    llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
    llvm/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
    llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
    llvm/test/CodeGen/AArch64/GlobalISel/select-bitfield-insert.ll
    llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
    llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll
    llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
    llvm/test/CodeGen/AArch64/a57-csel.ll
    llvm/test/CodeGen/AArch64/aarch64-addv.ll
    llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
    llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
    llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll
    llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
    llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
    llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
    llvm/test/CodeGen/AArch64/aarch64-fixup-statepoint-regs-crash.ll
    llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
    llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
    llvm/test/CodeGen/AArch64/aarch64-isel-csinc.ll
    llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
    llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
    llvm/test/CodeGen/AArch64/aarch64-mops-consecutive.ll
    llvm/test/CodeGen/AArch64/aarch64-mops.ll
    llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
    llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
    llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
    llvm/test/CodeGen/AArch64/aarch64-smull.ll
    llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll
    llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
    llvm/test/CodeGen/AArch64/aarch64_fnmadd.ll
    llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
    llvm/test/CodeGen/AArch64/abd-combine.ll
    llvm/test/CodeGen/AArch64/active_lane_mask.ll
    llvm/test/CodeGen/AArch64/add-extract.ll
    llvm/test/CodeGen/AArch64/addcarry-crash.ll
    llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
    llvm/test/CodeGen/AArch64/addsub.ll
    llvm/test/CodeGen/AArch64/align-down.ll
    llvm/test/CodeGen/AArch64/and-mask-removal.ll
    llvm/test/CodeGen/AArch64/andorbrcompare.ll
    llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
    llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
    llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
    llvm/test/CodeGen/AArch64/arm64-addrmode.ll
    llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
    llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
    llvm/test/CodeGen/AArch64/arm64-cse.ll
    llvm/test/CodeGen/AArch64/arm64-csel.ll
    llvm/test/CodeGen/AArch64/arm64-dup.ll
    llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
    llvm/test/CodeGen/AArch64/arm64-fmadd.ll
    llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll
    llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
    llvm/test/CodeGen/AArch64/arm64-inline-asm.ll
    llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll
    llvm/test/CodeGen/AArch64/arm64-ld1.ll
    llvm/test/CodeGen/AArch64/arm64-ldp.ll
    llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
    llvm/test/CodeGen/AArch64/arm64-mul.ll
    llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
    llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
    llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
    llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
    llvm/test/CodeGen/AArch64/arm64-nvcast.ll
    llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
    llvm/test/CodeGen/AArch64/arm64-register-pairing.ll
    llvm/test/CodeGen/AArch64/arm64-rev.ll
    llvm/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
    llvm/test/CodeGen/AArch64/arm64-setcc-swap-infloop.ll
    llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
    llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
    llvm/test/CodeGen/AArch64/arm64-tbl.ll
    llvm/test/CodeGen/AArch64/arm64-vabs.ll
    llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
    llvm/test/CodeGen/AArch64/arm64-vhadd.ll
    llvm/test/CodeGen/AArch64/arm64-vmul.ll
    llvm/test/CodeGen/AArch64/arm64-vshift.ll
    llvm/test/CodeGen/AArch64/arm64-xaluo.ll
    llvm/test/CodeGen/AArch64/arm64-zip.ll
    llvm/test/CodeGen/AArch64/arm64_32-addrs.ll
    llvm/test/CodeGen/AArch64/arm64_32.ll
    llvm/test/CodeGen/AArch64/arm64ec-reservedregs.ll
    llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
    llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
    llvm/test/CodeGen/AArch64/atomic-ops.ll
    llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
    llvm/test/CodeGen/AArch64/bcmp.ll
    llvm/test/CodeGen/AArch64/bf16-shuffle.ll
    llvm/test/CodeGen/AArch64/bfis-in-loop.ll
    llvm/test/CodeGen/AArch64/bitfield-insert.ll
    llvm/test/CodeGen/AArch64/bool-ext-inc.ll
    llvm/test/CodeGen/AArch64/branch-relax-alignment.ll
    llvm/test/CodeGen/AArch64/branch-relax-bcc.ll
    llvm/test/CodeGen/AArch64/build-one-lane.ll
    llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
    llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
    llvm/test/CodeGen/AArch64/cgp-usubo.ll
    llvm/test/CodeGen/AArch64/cmp-chains.ll
    llvm/test/CodeGen/AArch64/cmp-select-sign.ll
    llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
    llvm/test/CodeGen/AArch64/combine-andintoload.ll
    llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
    llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
    llvm/test/CodeGen/AArch64/cond-br-tuning.ll
    llvm/test/CodeGen/AArch64/consthoist-gep.ll
    llvm/test/CodeGen/AArch64/copyprop.ll
    llvm/test/CodeGen/AArch64/ctpop-nonean.ll
    llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
    llvm/test/CodeGen/AArch64/dag-combine-select.ll
    llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
    llvm/test/CodeGen/AArch64/dag-numsignbits.ll
    llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
    llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
    llvm/test/CodeGen/AArch64/double_reduct.ll
    llvm/test/CodeGen/AArch64/expand-select.ll
    llvm/test/CodeGen/AArch64/expand-vector-rot.ll
    llvm/test/CodeGen/AArch64/extbinopload.ll
    llvm/test/CodeGen/AArch64/extend_inreg_of_concat_subvectors.ll
    llvm/test/CodeGen/AArch64/extract-bits.ll
    llvm/test/CodeGen/AArch64/extract-lowbits.ll
    llvm/test/CodeGen/AArch64/f16-instructions.ll
    llvm/test/CodeGen/AArch64/fabs.ll
    llvm/test/CodeGen/AArch64/fadd-combines.ll
    llvm/test/CodeGen/AArch64/faddp-half.ll
    llvm/test/CodeGen/AArch64/faddp.ll
    llvm/test/CodeGen/AArch64/fast-isel-addressing-modes.ll
    llvm/test/CodeGen/AArch64/fast-isel-gep.ll
    llvm/test/CodeGen/AArch64/fast-isel-memcpy.ll
    llvm/test/CodeGen/AArch64/fast-isel-shift.ll
    llvm/test/CodeGen/AArch64/fcopysign.ll
    llvm/test/CodeGen/AArch64/fcvt.ll
    llvm/test/CodeGen/AArch64/fcvt_combine.ll
    llvm/test/CodeGen/AArch64/fdiv-combine.ll
    llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
    llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
    llvm/test/CodeGen/AArch64/flags-multiuse.ll
    llvm/test/CodeGen/AArch64/fmaximum-legalization.ll
    llvm/test/CodeGen/AArch64/fminimummaximum.ll
    llvm/test/CodeGen/AArch64/fminmax.ll
    llvm/test/CodeGen/AArch64/fmlal-loreg.ll
    llvm/test/CodeGen/AArch64/fold-csel-cttz-and.ll
    llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
    llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
    llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
    llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
    llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
    llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
    llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
    llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
    llvm/test/CodeGen/AArch64/fptrunc.ll
    llvm/test/CodeGen/AArch64/fsqrt.ll
    llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
    llvm/test/CodeGen/AArch64/funnel-shift.ll
    llvm/test/CodeGen/AArch64/global-merge-3.ll
    llvm/test/CodeGen/AArch64/gpr_cttz.ll
    llvm/test/CodeGen/AArch64/half.ll
    llvm/test/CodeGen/AArch64/highextractbitcast.ll
    llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
    llvm/test/CodeGen/AArch64/i128-math.ll
    llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
    llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
    llvm/test/CodeGen/AArch64/insert-extend.ll
    llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
    llvm/test/CodeGen/AArch64/insert-subvector.ll
    llvm/test/CodeGen/AArch64/insertshuffleload.ll
    llvm/test/CodeGen/AArch64/isinf.ll
    llvm/test/CodeGen/AArch64/known-never-nan.ll
    llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
    llvm/test/CodeGen/AArch64/load-insert-zero.ll
    llvm/test/CodeGen/AArch64/logic-reassociate.ll
    llvm/test/CodeGen/AArch64/logic-shift.ll
    llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
    llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
    llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
    llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll
    llvm/test/CodeGen/AArch64/machine-combiner-transient.ll
    llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
    llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
    llvm/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll
    llvm/test/CodeGen/AArch64/madd-combiner.ll
    llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll
    llvm/test/CodeGen/AArch64/merge-trunc-store.ll
    llvm/test/CodeGen/AArch64/midpoint-int.ll
    llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
    llvm/test/CodeGen/AArch64/minmax.ll
    llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir
    llvm/test/CodeGen/AArch64/misched-fusion-lit.ll
    llvm/test/CodeGen/AArch64/mul_pow2.ll
    llvm/test/CodeGen/AArch64/mulcmle.ll
    llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
    llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
    llvm/test/CodeGen/AArch64/neg-imm.ll
    llvm/test/CodeGen/AArch64/neon-abd.ll
    llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
    llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
    llvm/test/CodeGen/AArch64/neon-dotpattern.ll
    llvm/test/CodeGen/AArch64/neon-dotreduce.ll
    llvm/test/CodeGen/AArch64/neon-extadd.ll
    llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
    llvm/test/CodeGen/AArch64/neon-mov.ll
    llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
    llvm/test/CodeGen/AArch64/neon-rshrn.ll
    llvm/test/CodeGen/AArch64/neon-shift-neg.ll
    llvm/test/CodeGen/AArch64/neon-truncstore.ll
    llvm/test/CodeGen/AArch64/neon-wide-splat.ll
    llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
    llvm/test/CodeGen/AArch64/no-sve-no-neon.ll
    llvm/test/CodeGen/AArch64/nontemporal-load.ll
    llvm/test/CodeGen/AArch64/nontemporal.ll
    llvm/test/CodeGen/AArch64/nzcv-save.ll
    llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
    llvm/test/CodeGen/AArch64/peephole-and-tst.ll
    llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
    llvm/test/CodeGen/AArch64/pr-cf624b2.ll
    llvm/test/CodeGen/AArch64/pr58350.ll
    llvm/test/CodeGen/AArch64/pr58516.ll
    llvm/test/CodeGen/AArch64/pr61549.ll
    llvm/test/CodeGen/AArch64/predicated-add-sub.ll
    llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll
    llvm/test/CodeGen/AArch64/ragreedy-csr.ll
    llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
    llvm/test/CodeGen/AArch64/rand.ll
    llvm/test/CodeGen/AArch64/rcpc3-sve.ll
    llvm/test/CodeGen/AArch64/reduce-and.ll
    llvm/test/CodeGen/AArch64/reduce-or.ll
    llvm/test/CodeGen/AArch64/reduce-shuffle.ll
    llvm/test/CodeGen/AArch64/reduce-xor.ll
    llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
    llvm/test/CodeGen/AArch64/rotate-extract.ll
    llvm/test/CodeGen/AArch64/sadd_sat.ll
    llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
    llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
    llvm/test/CodeGen/AArch64/sat-add.ll
    llvm/test/CodeGen/AArch64/select-constant-xor.ll
    llvm/test/CodeGen/AArch64/select_const.ll
    llvm/test/CodeGen/AArch64/select_fmf.ll
    llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
    llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
    llvm/test/CodeGen/AArch64/settag-merge-order.ll
    llvm/test/CodeGen/AArch64/settag-merge.ll
    llvm/test/CodeGen/AArch64/settag.ll
    llvm/test/CodeGen/AArch64/sext.ll
    llvm/test/CodeGen/AArch64/shift-amount-mod.ll
    llvm/test/CodeGen/AArch64/shift-by-signext.ll
    llvm/test/CodeGen/AArch64/shift_minsize.ll
    llvm/test/CodeGen/AArch64/shrink-wrap-byval-inalloca-preallocated.ll
    llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
    llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
    llvm/test/CodeGen/AArch64/shuffles.ll
    llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll
    llvm/test/CodeGen/AArch64/sinksplat.ll
    llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
    llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
    llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
    llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
    llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
    llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll
    llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
    llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
    llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
    llvm/test/CodeGen/AArch64/sme-streaming-body.ll
    llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
    llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll
    llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
    llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
    llvm/test/CodeGen/AArch64/split-vector-insert.ll
    llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
    llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
    llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
    llvm/test/CodeGen/AArch64/sshl_sat.ll
    llvm/test/CodeGen/AArch64/ssub_sat.ll
    llvm/test/CodeGen/AArch64/ssub_sat_plus.ll
    llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
    llvm/test/CodeGen/AArch64/stack-guard-sysreg.ll
    llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
    llvm/test/CodeGen/AArch64/sve-abd.ll
    llvm/test/CodeGen/AArch64/sve-alloca.ll
    llvm/test/CodeGen/AArch64/sve-bitcast.ll
    llvm/test/CodeGen/AArch64/sve-breakdown-scalable-vectortype.ll
    llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
    llvm/test/CodeGen/AArch64/sve-doublereduct.ll
    llvm/test/CodeGen/AArch64/sve-expand-div.ll
    llvm/test/CodeGen/AArch64/sve-extract-element.ll
    llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
    llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
    llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
    llvm/test/CodeGen/AArch64/sve-fcmp.ll
    llvm/test/CodeGen/AArch64/sve-fcopysign.ll
    llvm/test/CodeGen/AArch64/sve-fix-length-and-combine-512.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-no-vscale-range.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
    llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
    llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
    llvm/test/CodeGen/AArch64/sve-fp-combine.ll
    llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
    llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
    llvm/test/CodeGen/AArch64/sve-fpext-load.ll
    llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
    llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
    llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll
    llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
    llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
    llvm/test/CodeGen/AArch64/sve-gep.ll
    llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll
    llvm/test/CodeGen/AArch64/sve-insert-element.ll
    llvm/test/CodeGen/AArch64/sve-insert-vector.ll
    llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
    llvm/test/CodeGen/AArch64/sve-int-arith.ll
    llvm/test/CodeGen/AArch64/sve-int-reduce.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems-i32.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll
    llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
    llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
    llvm/test/CodeGen/AArch64/sve-ld1r.ll
    llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
    llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
    llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll
    llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
    llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
    llvm/test/CodeGen/AArch64/sve-pr62151.ll
    llvm/test/CodeGen/AArch64/sve-pred-arith.ll
    llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
    llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll
    llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
    llvm/test/CodeGen/AArch64/sve-redundant-store.ll
    llvm/test/CodeGen/AArch64/sve-select.ll
    llvm/test/CodeGen/AArch64/sve-sext-zext.ll
    llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
    llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
    llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
    llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll
    llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
    llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
    llvm/test/CodeGen/AArch64/sve-split-load.ll
    llvm/test/CodeGen/AArch64/sve-split-store.ll
    llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll
    llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
    llvm/test/CodeGen/AArch64/sve-stepvector.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
    llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
    llvm/test/CodeGen/AArch64/sve-tailcall.ll
    llvm/test/CodeGen/AArch64/sve-trunc.ll
    llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
    llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll
    llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
    llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
    llvm/test/CodeGen/AArch64/sve-vector-splat.ll
    llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
    llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
    llvm/test/CodeGen/AArch64/sve2-fcopysign.ll
    llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll
    llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
    llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll
    llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll
    llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll
    llvm/test/CodeGen/AArch64/swift-return.ll
    llvm/test/CodeGen/AArch64/swifterror.ll
    llvm/test/CodeGen/AArch64/tbl-loops.ll
    llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
    llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
    llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
    llvm/test/CodeGen/AArch64/typepromotion-signed.ll
    llvm/test/CodeGen/AArch64/uadd_sat.ll
    llvm/test/CodeGen/AArch64/uadd_sat_plus.ll
    llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
    llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
    llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbits.ll
    llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
    llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-lowhigh.ll
    llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
    llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
    llvm/test/CodeGen/AArch64/urem-lkk.ll
    llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
    llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll
    llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
    llvm/test/CodeGen/AArch64/usub_sat_vec.ll
    llvm/test/CodeGen/AArch64/vcvt-oversize.ll
    llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
    llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
    llvm/test/CodeGen/AArch64/vec-libcalls.ll
    llvm/test/CodeGen/AArch64/vec_cttz.ll
    llvm/test/CodeGen/AArch64/vec_uaddo.ll
    llvm/test/CodeGen/AArch64/vec_umulo.ll
    llvm/test/CodeGen/AArch64/vecreduce-add.ll
    llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
    llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
    llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
    llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
    llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
    llvm/test/CodeGen/AArch64/vector-fcopysign.ll
    llvm/test/CodeGen/AArch64/vector-fcvt.ll
    llvm/test/CodeGen/AArch64/vector-gep.ll
    llvm/test/CodeGen/AArch64/vldn_shuffle.ll
    llvm/test/CodeGen/AArch64/vselect-constants.ll
    llvm/test/CodeGen/AArch64/vselect-ext.ll
    llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/AArch64/win64_vararg.ll
    llvm/test/CodeGen/AArch64/win64_vararg2.ll
    llvm/test/CodeGen/AArch64/win64_vararg_float.ll
    llvm/test/CodeGen/AArch64/win64_vararg_float_cc.ll
    llvm/test/CodeGen/AArch64/wineh-bti.ll
    llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
    llvm/test/CodeGen/AArch64/zext-to-tbl.ll
    llvm/test/CodeGen/AArch64/zext.ll
    llvm/test/MC/AArch64/elf-globaladdress.ll
    llvm/test/MachineVerifier/test_g_concat_vectors.mir
    llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
    llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
    llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-ldp.ll
    llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll
    llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
    llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
    llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected

Removed: 
    


################################################################################
diff --git a/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c b/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
index f461eadf652e17..e6eb98b027bf6b 100644
--- a/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
+++ b/clang/test/CodeGen/aarch64-ABI-align-packed-assembly.c
@@ -55,7 +55,7 @@ struct non_packed_struct gs_non_packed_struct;
 __attribute__((noinline)) void named_arg_non_packed_struct(double d0, double d1, double d2, double d3,
                                  double d4, double d5, double d6, double d7,
                                  double d8, struct non_packed_struct s_non_packed_struct) {
-// CHECK: ldr q0, [sp, #16]
+// CHECK: ldr q1, [sp, #16]
     gd = d8;
     gs_non_packed_struct = s_non_packed_struct;
 }
@@ -74,8 +74,8 @@ void test_non_packed_struct() {
     init(1, &s_non_packed_struct);
 
 // CHECK: mov x8, #4611686018427387904        // =0x4000000000000000
-// CHECK: str q0, [sp, #16]
 // CHECK: str x8, [sp]
+// CHECK: str q0, [sp, #16]
     named_arg_non_packed_struct(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_non_packed_struct);
 // CHECK: str q0, [sp, #16]
     variadic_non_packed_struct(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_non_packed_struct);
@@ -86,7 +86,7 @@ struct packed_struct gs_packed_struct;
 __attribute__((noinline)) void named_arg_packed_struct(double d0, double d1, double d2, double d3,
                                  double d4, double d5, double d6, double d7,
                                  double d8, struct packed_struct s_packed_struct) {
-// CHECK: ldur q0, [sp, #8]
+// CHECK: ldur q1, [sp, #8]
     gd = d8;
     gs_packed_struct = s_packed_struct;
 }
@@ -105,8 +105,8 @@ void test_packed_struct() {
     init(1, &s_packed_struct);
 
 // CHECK: mov x8, #4611686018427387904        // =0x4000000000000000
-// CHECK: stur q0, [sp, #8]
 // CHECK: str x8, [sp]
+// CHECK: stur q0, [sp, #8]
     named_arg_packed_struct(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_packed_struct);
 // CHECK: stur q0, [sp, #8]
     variadic_packed_struct(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_packed_struct);
@@ -117,7 +117,7 @@ struct packed_member gs_packed_member;
 __attribute__((noinline)) void named_arg_packed_member(double d0, double d1, double d2, double d3,
                                  double d4, double d5, double d6, double d7,
                                  double d8, struct packed_member s_packed_member) {
-// CHECK: ldur q0, [sp, #8]
+// CHECK: ldur q1, [sp, #8]
     gd = d8;
     gs_packed_member = s_packed_member;
 }
@@ -136,8 +136,8 @@ void test_packed_member() {
     init(1, &s_packed_member);
 
 // CHECK: mov x8, #4611686018427387904        // =0x4000000000000000
-// CHECK: stur q0, [sp, #8]
 // CHECK: str x8, [sp]
+// CHECK: stur q0, [sp, #8]
     named_arg_packed_member(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_packed_member);
 // CHECK: stur q0, [sp, #8]
     variadic_packed_member(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_packed_member);
@@ -148,7 +148,7 @@ struct aligned_struct_8 gs_aligned_struct_8;
 __attribute__((noinline)) void named_arg_aligned_struct_8(double d0, double d1, double d2, double d3,
                                  double d4, double d5, double d6, double d7,
                                  double d8, struct aligned_struct_8 s_aligned_struct_8) {
-// CHECK: ldr q0, [sp, #16]
+// CHECK: ldr q1, [sp, #16]
     gd = d8;
     gs_aligned_struct_8 = s_aligned_struct_8;
 }
@@ -167,8 +167,8 @@ void test_aligned_struct_8() {
     init(1, &s_aligned_struct_8);
 
 // CHECK: mov x8, #4611686018427387904        // =0x4000000000000000
-// CHECK: str q0, [sp, #16]
 // CHECK: str x8, [sp]
+// CHECK: str q0, [sp, #16]
     named_arg_aligned_struct_8(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_aligned_struct_8);
 // CHECK: str q0, [sp, #16]
     variadic_aligned_struct_8(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_aligned_struct_8);
@@ -179,7 +179,7 @@ struct aligned_member_8 gs_aligned_member_8;
 __attribute__((noinline)) void named_arg_aligned_member_8(double d0, double d1, double d2, double d3,
                                  double d4, double d5, double d6, double d7,
                                  double d8, struct aligned_member_8 s_aligned_member_8) {
-// CHECK: ldr q0, [sp, #16]
+// CHECK: ldr q1, [sp, #16]
     gd = d8;
     gs_aligned_member_8 = s_aligned_member_8;
 }
@@ -198,8 +198,8 @@ void test_aligned_member_8() {
     init(1, &s_aligned_member_8);
 
 // CHECK: mov x8, #4611686018427387904        // =0x4000000000000000
-// CHECK: str q0, [sp, #16]
 // CHECK: str x8, [sp]
+// CHECK: str q0, [sp, #16]
     named_arg_aligned_member_8(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_aligned_member_8);
 // CHECK: str q0, [sp, #16]
     variadic_aligned_member_8(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_aligned_member_8);
@@ -210,7 +210,7 @@ struct pragma_packed_struct_8 gs_pragma_packed_struct_8;
 __attribute__((noinline)) void named_arg_pragma_packed_struct_8(double d0, double d1, double d2, double d3,
                                  double d4, double d5, double d6, double d7,
                                  double d8, struct pragma_packed_struct_8 s_pragma_packed_struct_8) {
-// CHECK: ldur q0, [sp, #8]
+// CHECK: ldur q1, [sp, #8]
     gd = d8;
     gs_pragma_packed_struct_8 = s_pragma_packed_struct_8;
 }
@@ -229,8 +229,8 @@ void test_pragma_packed_struct_8() {
     init(1, &s_pragma_packed_struct_8);
 
 // CHECK: mov x8, #4611686018427387904        // =0x4000000000000000
-// CHECK: stur q0, [sp, #8]
 // CHECK: str x8, [sp]
+// CHECK: stur q0, [sp, #8]
     named_arg_pragma_packed_struct_8(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_pragma_packed_struct_8);
 // CHECK: stur q0, [sp, #8]
     variadic_pragma_packed_struct_8(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_pragma_packed_struct_8);
@@ -241,7 +241,7 @@ struct pragma_packed_struct_4 gs_pragma_packed_struct_4;
 __attribute__((noinline)) void named_arg_pragma_packed_struct_4(double d0, double d1, double d2, double d3,
                                  double d4, double d5, double d6, double d7,
                                  double d8, struct pragma_packed_struct_4 s_pragma_packed_struct_4) {
-// CHECK: ldur q0, [sp, #8]
+// CHECK: ldur q1, [sp, #8]
     gd = d8;
     gs_pragma_packed_struct_4 = s_pragma_packed_struct_4;
 }
@@ -260,8 +260,8 @@ void test_pragma_packed_struct_4() {
     init(1, &s_pragma_packed_struct_4);
 
 // CHECK: mov x8, #4611686018427387904        // =0x4000000000000000
-// CHECK: stur q0, [sp, #8]
 // CHECK: str x8, [sp]
+// CHECK: stur q0, [sp, #8]
     named_arg_pragma_packed_struct_4(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_pragma_packed_struct_4);
 // CHECK: stur q0, [sp, #8]
     variadic_pragma_packed_struct_4(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, s_pragma_packed_struct_4);

diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index e27c4230e1fdc3..9a7cc283b5c15c 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -1402,7 +1402,7 @@ def ProcessorFeatures {
 
 // FeatureFuseAdrpAdd is enabled under Generic to allow linker merging
 // optimizations.
-def : ProcessorModel<"generic", CortexA55Model, ProcessorFeatures.Generic,
+def : ProcessorModel<"generic", CortexA510Model, ProcessorFeatures.Generic,
                      [FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler,
                       FeatureEnableSelectOptimize]>;
 def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53,

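One way to observe the retargeting (an assumed workflow, not part of the patch)
is to ask llvm-mca for the statistics of the default model; after this change,
-mcpu=generic and -mcpu=cortex-a510 should report the same dispatch width and
resource usage:

    # Sketch only: llvm-mca consumes assembly and prints statistics derived
    # from the selected CPU's scheduling model.
    # RUN: llvm-mca -mtriple=aarch64 -mcpu=generic %s
    # RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a510 %s
    add w8, w0, w1
    mul w9, w8, w2
    sub w0, w9, w3
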
diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
index 5578e609eca329..4477bfbf577f98 100644
--- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
@@ -125,9 +125,9 @@ define <2 x i64> @v2i64_select_sle(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CODE:    mov
 ; CODE:    mov
 ; CODE:    mov
+; CODE:    ldr
 ; CODE:    cmge
 ; CODE:    cmge
-; CODE:    ldr
 ; CODE:    bif
 ; CODE:    bif
 ; CODE:    ext
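
The reordering above (the load moving ahead of the two cmge instructions) is
purely a scheduling-order effect of the new model. Check lines like these are
typically regenerated rather than hand-edited, presumably with the
UpdateTestChecks scripts, e.g. (hypothetical invocation):

    $ llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/AArch64/addsub.ll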

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll
index 6158e7d7a8a818..0c52a8a683e3a0 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll
@@ -1129,7 +1129,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1141,7 +1141,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1153,7 +1153,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1165,7 +1165,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1177,7 +1177,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1189,7 +1189,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1201,7 +1201,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1213,7 +1213,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1225,7 +1225,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1237,7 +1237,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1249,7 +1249,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1261,7 +1261,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1273,7 +1273,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1285,7 +1285,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1297,7 +1297,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1312,7 +1312,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1327,7 +1327,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1342,7 +1342,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -1357,7 +1357,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -1372,7 +1372,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -1904,7 +1904,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1916,7 +1916,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1928,7 +1928,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1940,7 +1940,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1952,7 +1952,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1964,7 +1964,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1976,7 +1976,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1988,7 +1988,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2000,7 +2000,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2012,7 +2012,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2024,7 +2024,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2036,7 +2036,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2048,7 +2048,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2060,7 +2060,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2072,7 +2072,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2086,7 +2086,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2100,7 +2100,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2114,7 +2114,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2128,7 +2128,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2142,7 +2142,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -2489,9 +2489,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -2512,9 +2512,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -2535,9 +2535,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -2558,9 +2558,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -2581,9 +2581,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -2679,7 +2679,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -2691,7 +2691,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -2703,7 +2703,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -2715,7 +2715,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -2727,7 +2727,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -2739,7 +2739,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -2751,7 +2751,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -2763,7 +2763,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2775,7 +2775,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2787,7 +2787,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2799,7 +2799,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2811,7 +2811,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2823,7 +2823,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2835,7 +2835,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2847,7 +2847,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2862,7 +2862,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2877,7 +2877,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2892,7 +2892,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2907,7 +2907,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2922,7 +2922,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3312,9 +3312,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -3339,9 +3339,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -3366,9 +3366,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -3393,9 +3393,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -3420,9 +3420,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
@@ -3530,7 +3530,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -3544,7 +3544,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -3558,7 +3558,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -3572,7 +3572,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -3586,7 +3586,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -3600,7 +3600,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -3614,7 +3614,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -3628,7 +3628,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -3642,7 +3642,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -3656,7 +3656,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -3670,7 +3670,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -3684,7 +3684,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -3698,7 +3698,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -3712,7 +3712,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -3726,7 +3726,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -3744,7 +3744,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3763,7 +3763,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3782,7 +3782,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3801,7 +3801,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3820,7 +3820,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -4169,9 +4169,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4192,9 +4192,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4215,9 +4215,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -4238,9 +4238,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -4261,9 +4261,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -4359,7 +4359,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4371,7 +4371,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4383,7 +4383,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4395,7 +4395,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4407,7 +4407,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4419,7 +4419,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4431,7 +4431,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4443,7 +4443,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4455,7 +4455,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4467,7 +4467,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4479,7 +4479,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4491,7 +4491,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4503,7 +4503,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4515,7 +4515,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4527,7 +4527,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4542,7 +4542,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4557,7 +4557,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4572,7 +4572,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4587,7 +4587,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4602,7 +4602,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4949,9 +4949,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4972,9 +4972,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4995,9 +4995,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -5018,9 +5018,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -5041,9 +5041,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -5139,7 +5139,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5151,7 +5151,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5163,7 +5163,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5175,7 +5175,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5187,7 +5187,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5199,7 +5199,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5211,7 +5211,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5223,7 +5223,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5235,7 +5235,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5247,7 +5247,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5259,7 +5259,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5271,7 +5271,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5283,7 +5283,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5295,7 +5295,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5307,7 +5307,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5322,7 +5322,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5337,7 +5337,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5352,7 +5352,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5367,7 +5367,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5382,7 +5382,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6109,8 +6109,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6127,8 +6127,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6145,8 +6145,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6163,8 +6163,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6181,8 +6181,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6197,8 +6197,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6213,8 +6213,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6229,8 +6229,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6245,8 +6245,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6261,8 +6261,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6277,8 +6277,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6293,8 +6293,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6309,8 +6309,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6325,8 +6325,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6341,8 +6341,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6366,9 +6366,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6392,9 +6392,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6418,9 +6418,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6444,9 +6444,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6470,9 +6470,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -7199,8 +7199,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7217,8 +7217,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7235,8 +7235,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7253,8 +7253,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7271,8 +7271,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7287,8 +7287,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7303,8 +7303,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7319,8 +7319,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7335,8 +7335,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7351,8 +7351,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7367,8 +7367,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7383,8 +7383,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7399,8 +7399,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7415,8 +7415,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7431,8 +7431,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7456,9 +7456,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7482,9 +7482,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7508,9 +7508,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7534,9 +7534,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7560,9 +7560,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -8283,8 +8283,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -8300,8 +8300,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -8317,8 +8317,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -8334,8 +8334,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -8351,8 +8351,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -8367,8 +8367,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -8383,8 +8383,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -8399,8 +8399,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -8415,8 +8415,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8431,8 +8431,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8447,8 +8447,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8463,8 +8463,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8479,8 +8479,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8495,8 +8495,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -8511,8 +8511,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -8536,9 +8536,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -8562,9 +8562,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -8588,9 +8588,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -8614,9 +8614,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -8640,9 +8640,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -9363,8 +9363,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -9380,8 +9380,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -9397,8 +9397,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -9414,8 +9414,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -9431,8 +9431,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -9447,8 +9447,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -9463,8 +9463,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -9479,8 +9479,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -9495,8 +9495,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -9511,8 +9511,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -9527,8 +9527,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -9543,8 +9543,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -9559,8 +9559,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -9575,8 +9575,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -9591,8 +9591,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -9616,9 +9616,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -9642,9 +9642,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -9668,9 +9668,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -9694,9 +9694,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -9720,9 +9720,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

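All of the hunks in the file above are the same mechanical update from regenerating the checks: the expected w19/x19 and x20 operands become w20/x20 and x21, presumably because the retuned generic model schedules the code leading up to the libcall differently and the incoming value ends up held in the next callee-saved register; the compare-exchange loop itself is unchanged. As an illustrative sketch only (reduced from the atomicrmw_min_i64_unaligned_release hunk above, with the new -O1 expectations inlined as comments), the shape of a test function being updated is:

    ; Unaligned (align 1) atomic read-modify-write ops lower to a loop
    ; around the __atomic_compare_exchange libcall; the -O0/-O1 check
    ; lines in this file pin down the registers used inside that loop.
    define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value) {
      ; -O1 now expects:
      ;   cmp  x0, x20
      ;   csel x8, x0, x20, le
      ;   bl   __atomic_compare_exchange
      %r = atomicrmw min ptr %ptr, i64 %value release, align 1
      ret i64 %r
    }
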
diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2_lse128.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2_lse128.ll
index e04660449b098d..4b0371d23b5b89 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2_lse128.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2_lse128.ll
@@ -654,7 +654,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -666,7 +666,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -678,7 +678,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -690,7 +690,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -702,7 +702,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -714,7 +714,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -726,7 +726,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -738,7 +738,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -750,7 +750,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -762,7 +762,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -774,7 +774,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -786,7 +786,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -798,7 +798,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -810,7 +810,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -822,7 +822,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -837,7 +837,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -852,7 +852,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -867,7 +867,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -882,7 +882,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -897,7 +897,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -1184,7 +1184,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1196,7 +1196,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1208,7 +1208,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1220,7 +1220,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1232,7 +1232,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1244,7 +1244,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1256,7 +1256,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1268,7 +1268,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1280,7 +1280,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1292,7 +1292,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1304,7 +1304,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1316,7 +1316,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1328,7 +1328,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1340,7 +1340,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1352,7 +1352,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1366,7 +1366,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1380,7 +1380,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1394,7 +1394,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -1408,7 +1408,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -1422,7 +1422,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -1704,7 +1704,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1716,7 +1716,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1728,7 +1728,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1740,7 +1740,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1752,7 +1752,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1764,7 +1764,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1776,7 +1776,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1788,7 +1788,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1800,7 +1800,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1812,7 +1812,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1824,7 +1824,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1836,7 +1836,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1848,7 +1848,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1860,7 +1860,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1872,7 +1872,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1887,7 +1887,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1902,7 +1902,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1917,7 +1917,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -1932,7 +1932,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -1947,7 +1947,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -2495,7 +2495,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -2509,7 +2509,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -2523,7 +2523,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -2537,7 +2537,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -2551,7 +2551,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -2565,7 +2565,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -2579,7 +2579,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -2593,7 +2593,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -2607,7 +2607,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -2621,7 +2621,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -2635,7 +2635,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -2649,7 +2649,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -2663,7 +2663,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -2677,7 +2677,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -2691,7 +2691,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -2709,7 +2709,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2728,7 +2728,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2747,7 +2747,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2766,7 +2766,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2785,7 +2785,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3024,7 +3024,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3036,7 +3036,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3048,7 +3048,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3060,7 +3060,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3072,7 +3072,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3084,7 +3084,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3096,7 +3096,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3108,7 +3108,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3120,7 +3120,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3132,7 +3132,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3144,7 +3144,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3156,7 +3156,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3168,7 +3168,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3180,7 +3180,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3192,7 +3192,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3207,7 +3207,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3222,7 +3222,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3237,7 +3237,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3252,7 +3252,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3267,7 +3267,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3559,7 +3559,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3571,7 +3571,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3583,7 +3583,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3595,7 +3595,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3607,7 +3607,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3619,7 +3619,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3631,7 +3631,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3643,7 +3643,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3655,7 +3655,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3667,7 +3667,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3679,7 +3679,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3691,7 +3691,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3703,7 +3703,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3715,7 +3715,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3727,7 +3727,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3742,7 +3742,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3757,7 +3757,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3772,7 +3772,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3787,7 +3787,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3802,7 +3802,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4154,8 +4154,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4172,8 +4172,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4190,8 +4190,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4208,8 +4208,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4226,8 +4226,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4242,8 +4242,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4258,8 +4258,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4274,8 +4274,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4290,8 +4290,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4306,8 +4306,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4322,8 +4322,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4338,8 +4338,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4354,8 +4354,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4370,8 +4370,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4386,8 +4386,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4411,9 +4411,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4437,9 +4437,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4463,9 +4463,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4489,9 +4489,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4515,9 +4515,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4869,8 +4869,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4887,8 +4887,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4905,8 +4905,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4923,8 +4923,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4941,8 +4941,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4957,8 +4957,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4973,8 +4973,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4989,8 +4989,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5005,8 +5005,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5021,8 +5021,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5037,8 +5037,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5053,8 +5053,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5069,8 +5069,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5085,8 +5085,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5101,8 +5101,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5126,9 +5126,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5152,9 +5152,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5178,9 +5178,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5204,9 +5204,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5230,9 +5230,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -5583,8 +5583,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5600,8 +5600,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5617,8 +5617,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5634,8 +5634,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5651,8 +5651,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5667,8 +5667,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5683,8 +5683,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5699,8 +5699,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5715,8 +5715,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5731,8 +5731,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5747,8 +5747,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5763,8 +5763,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5779,8 +5779,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5795,8 +5795,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5811,8 +5811,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5836,9 +5836,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5862,9 +5862,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5888,9 +5888,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5914,9 +5914,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5940,9 +5940,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
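
Across the max/min/umax/umin hunks, only the comparison and the csel
condition that picks between the loaded value and the preserved operand
differ: gt/le for signed max/min, hi/ls for unsigned, and lt/ge/lo/hs for
the reversed i128 compares. A hypothetical umin sketch (function name
assumed):

define dso_local i32 @atomicrmw_umin_i32_unaligned_example(ptr %ptr, i32 %value) {
; ls (unsigned <=) keeps the smaller of the loaded value (w0) and the
; preserved operand (w20) before the compare-exchange retry call.
; -O1:    cmp w0, w20
; -O1:    csel w8, w0, w20, ls
; -O1:    bl __atomic_compare_exchange
    %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
    ret i32 %r
}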
@@ -6293,8 +6293,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6310,8 +6310,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6327,8 +6327,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6344,8 +6344,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6361,8 +6361,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6377,8 +6377,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6393,8 +6393,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6409,8 +6409,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6425,8 +6425,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6441,8 +6441,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6457,8 +6457,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6473,8 +6473,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6489,8 +6489,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6505,8 +6505,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6521,8 +6521,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6546,9 +6546,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6572,9 +6572,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6598,9 +6598,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6624,9 +6624,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6650,9 +6650,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
index cb8e7530dedf06..89d22c59e630b4 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll
@@ -709,7 +709,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -721,7 +721,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -733,7 +733,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -745,7 +745,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -757,7 +757,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -769,7 +769,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -781,7 +781,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -793,7 +793,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -805,7 +805,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -817,7 +817,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -829,7 +829,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -841,7 +841,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -853,7 +853,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -865,7 +865,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -877,7 +877,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -892,7 +892,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -907,7 +907,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -922,7 +922,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -937,7 +937,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -952,7 +952,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
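
For i128 add (and sub below) the two halves are combined through the carry
flag: the low halves use the flag-setting adds/subs so the high halves can
be joined with adc/sbc before the retry call. The checks capture only the
low-half instruction, where the operand register moves from x20 to x21. A
reduced sketch (function name assumed; the high-half add is elided by the
checks):

define dso_local i128 @atomicrmw_add_i128_unaligned_example(ptr %ptr, i128 %value) {
; -O1:    ldp x0, x1, [x0]
; -O1:    adds x8, x0, x21
; -O1:    bl __atomic_compare_exchange
    %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 1
    ret i128 %r
}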
@@ -1349,7 +1349,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1361,7 +1361,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1373,7 +1373,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1385,7 +1385,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1397,7 +1397,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1409,7 +1409,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1421,7 +1421,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1433,7 +1433,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1445,7 +1445,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1457,7 +1457,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1469,7 +1469,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1481,7 +1481,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1493,7 +1493,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1505,7 +1505,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1517,7 +1517,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1531,7 +1531,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1545,7 +1545,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1559,7 +1559,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -1573,7 +1573,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -1587,7 +1587,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -1849,9 +1849,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -1872,9 +1872,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -1895,9 +1895,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -1918,9 +1918,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -1941,9 +1941,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
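
Unlike the unaligned cases, these aligned i128 hunks stay inline as an LL/SC
loop: ldxp/ldaxp loads both halves exclusively, two ANDs form the new pair,
and stxp/stlxp (the store-release variant) attempts the store, with w11
receiving the success status for the retry branch that the checks do not
capture. The A510 model evidently schedules the two ANDs in the opposite
order, so the temporaries swap names and stxp now takes them as (x10, x9);
the stored low/high values are identical. A reduced sketch (function name
assumed):

define dso_local i128 @atomicrmw_and_i128_aligned_example(ptr %ptr, i128 %value) {
; With align 16 no libcall is needed; x0/x1 hold the loaded pair and
; x2/x3 the incoming %value halves.
; -O1:    ldxp x0, x1, [x8]
; -O1:    and x9, x1, x3
; -O1:    and x10, x0, x2
; -O1:    stxp w11, x10, x9, [x8]
    %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
    ret i128 %r
}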
@@ -2014,7 +2014,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -2026,7 +2026,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -2038,7 +2038,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -2050,7 +2050,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -2062,7 +2062,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -2074,7 +2074,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -2086,7 +2086,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -2098,7 +2098,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2110,7 +2110,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2122,7 +2122,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2134,7 +2134,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2146,7 +2146,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2158,7 +2158,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2170,7 +2170,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2182,7 +2182,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2197,7 +2197,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2212,7 +2212,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2227,7 +2227,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2242,7 +2242,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2257,7 +2257,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -2647,9 +2647,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -2674,9 +2674,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -2701,9 +2701,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -2728,9 +2728,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -2755,9 +2755,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
@@ -2865,7 +2865,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -2879,7 +2879,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -2893,7 +2893,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -2907,7 +2907,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -2921,7 +2921,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -2935,7 +2935,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -2949,7 +2949,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -2963,7 +2963,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -2977,7 +2977,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -2991,7 +2991,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -3005,7 +3005,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -3019,7 +3019,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -3033,7 +3033,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -3047,7 +3047,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -3061,7 +3061,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -3079,7 +3079,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3098,7 +3098,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3117,7 +3117,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3136,7 +3136,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3155,7 +3155,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3319,9 +3319,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -3342,9 +3342,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -3365,9 +3365,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -3388,9 +3388,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -3411,9 +3411,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -3459,7 +3459,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3471,7 +3471,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3483,7 +3483,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3495,7 +3495,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3507,7 +3507,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3519,7 +3519,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3531,7 +3531,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3543,7 +3543,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3555,7 +3555,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3567,7 +3567,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3579,7 +3579,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3591,7 +3591,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3603,7 +3603,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3615,7 +3615,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3627,7 +3627,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3642,7 +3642,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3657,7 +3657,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3672,7 +3672,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3687,7 +3687,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3702,7 +3702,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3864,9 +3864,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -3887,9 +3887,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -3910,9 +3910,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -3933,9 +3933,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -3956,9 +3956,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -4004,7 +4004,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4016,7 +4016,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4028,7 +4028,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4040,7 +4040,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4052,7 +4052,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4064,7 +4064,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4076,7 +4076,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4088,7 +4088,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4100,7 +4100,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4112,7 +4112,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4124,7 +4124,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4136,7 +4136,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4148,7 +4148,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4160,7 +4160,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4172,7 +4172,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4187,7 +4187,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4202,7 +4202,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4217,7 +4217,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4232,7 +4232,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4247,7 +4247,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4974,8 +4974,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4992,8 +4992,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5010,8 +5010,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5028,8 +5028,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5046,8 +5046,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5062,8 +5062,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5078,8 +5078,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5094,8 +5094,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5110,8 +5110,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5126,8 +5126,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5142,8 +5142,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5158,8 +5158,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5174,8 +5174,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5190,8 +5190,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5206,8 +5206,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5231,9 +5231,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5257,9 +5257,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5283,9 +5283,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5309,9 +5309,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5335,9 +5335,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6064,8 +6064,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6082,8 +6082,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6100,8 +6100,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6118,8 +6118,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6136,8 +6136,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6152,8 +6152,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6168,8 +6168,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6184,8 +6184,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6200,8 +6200,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6216,8 +6216,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6232,8 +6232,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6248,8 +6248,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6264,8 +6264,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6280,8 +6280,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6296,8 +6296,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6321,9 +6321,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6347,9 +6347,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6373,9 +6373,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6399,9 +6399,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6425,9 +6425,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -7148,8 +7148,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7165,8 +7165,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7182,8 +7182,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7199,8 +7199,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7216,8 +7216,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7232,8 +7232,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7248,8 +7248,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7264,8 +7264,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7280,8 +7280,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7296,8 +7296,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7312,8 +7312,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7328,8 +7328,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7344,8 +7344,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7360,8 +7360,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7376,8 +7376,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7401,9 +7401,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7427,9 +7427,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7453,9 +7453,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7479,9 +7479,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7505,9 +7505,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -8228,8 +8228,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -8245,8 +8245,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -8262,8 +8262,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -8279,8 +8279,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -8296,8 +8296,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -8312,8 +8312,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -8328,8 +8328,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -8344,8 +8344,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -8360,8 +8360,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8376,8 +8376,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8392,8 +8392,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8408,8 +8408,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8424,8 +8424,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8440,8 +8440,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -8456,8 +8456,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -8481,9 +8481,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -8507,9 +8507,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -8533,9 +8533,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -8559,9 +8559,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -8585,9 +8585,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll
index 987c0640c0f4a2..bb6163f5bc3875 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll
@@ -1129,7 +1129,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1141,7 +1141,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1153,7 +1153,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1165,7 +1165,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1177,7 +1177,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1189,7 +1189,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1201,7 +1201,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1213,7 +1213,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1225,7 +1225,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1237,7 +1237,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1249,7 +1249,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1261,7 +1261,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1273,7 +1273,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1285,7 +1285,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1297,7 +1297,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1312,7 +1312,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1327,7 +1327,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1342,7 +1342,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -1357,7 +1357,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -1372,7 +1372,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -1904,7 +1904,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1916,7 +1916,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1928,7 +1928,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1940,7 +1940,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1952,7 +1952,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1964,7 +1964,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1976,7 +1976,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1988,7 +1988,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2000,7 +2000,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2012,7 +2012,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2024,7 +2024,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2036,7 +2036,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2048,7 +2048,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2060,7 +2060,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2072,7 +2072,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2086,7 +2086,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2100,7 +2100,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2114,7 +2114,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2128,7 +2128,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2142,7 +2142,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -2489,9 +2489,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -2512,9 +2512,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -2535,9 +2535,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -2558,9 +2558,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -2581,9 +2581,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -2679,7 +2679,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -2691,7 +2691,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -2703,7 +2703,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -2715,7 +2715,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -2727,7 +2727,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -2739,7 +2739,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -2751,7 +2751,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -2763,7 +2763,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2775,7 +2775,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2787,7 +2787,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2799,7 +2799,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2811,7 +2811,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2823,7 +2823,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2835,7 +2835,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2847,7 +2847,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2862,7 +2862,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2877,7 +2877,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2892,7 +2892,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2907,7 +2907,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2922,7 +2922,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3312,9 +3312,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -3339,9 +3339,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -3366,9 +3366,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -3393,9 +3393,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -3420,9 +3420,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
@@ -3530,7 +3530,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -3544,7 +3544,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -3558,7 +3558,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -3572,7 +3572,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -3586,7 +3586,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -3600,7 +3600,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -3614,7 +3614,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -3628,7 +3628,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -3642,7 +3642,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -3656,7 +3656,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -3670,7 +3670,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -3684,7 +3684,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -3698,7 +3698,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -3712,7 +3712,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -3726,7 +3726,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -3744,7 +3744,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3763,7 +3763,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3782,7 +3782,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3801,7 +3801,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3820,7 +3820,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -4169,9 +4169,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4192,9 +4192,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4215,9 +4215,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -4238,9 +4238,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -4261,9 +4261,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -4359,7 +4359,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4371,7 +4371,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4383,7 +4383,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4395,7 +4395,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4407,7 +4407,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4419,7 +4419,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4431,7 +4431,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4443,7 +4443,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4455,7 +4455,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4467,7 +4467,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4479,7 +4479,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4491,7 +4491,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4503,7 +4503,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4515,7 +4515,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4527,7 +4527,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4542,7 +4542,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4557,7 +4557,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4572,7 +4572,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4587,7 +4587,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4602,7 +4602,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4949,9 +4949,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4972,9 +4972,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4995,9 +4995,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -5018,9 +5018,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -5041,9 +5041,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -5139,7 +5139,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5151,7 +5151,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5163,7 +5163,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5175,7 +5175,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5187,7 +5187,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5199,7 +5199,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5211,7 +5211,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5223,7 +5223,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5235,7 +5235,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5247,7 +5247,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5259,7 +5259,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5271,7 +5271,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5283,7 +5283,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5295,7 +5295,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5307,7 +5307,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5322,7 +5322,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5337,7 +5337,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5352,7 +5352,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5367,7 +5367,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5382,7 +5382,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6109,8 +6109,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6127,8 +6127,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6145,8 +6145,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6163,8 +6163,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6181,8 +6181,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6197,8 +6197,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6213,8 +6213,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6229,8 +6229,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6245,8 +6245,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6261,8 +6261,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6277,8 +6277,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6293,8 +6293,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6309,8 +6309,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6325,8 +6325,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6341,8 +6341,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6366,9 +6366,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6392,9 +6392,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6418,9 +6418,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6444,9 +6444,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6470,9 +6470,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -7199,8 +7199,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7217,8 +7217,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7235,8 +7235,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7253,8 +7253,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7271,8 +7271,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7287,8 +7287,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7303,8 +7303,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7319,8 +7319,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7335,8 +7335,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7351,8 +7351,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7367,8 +7367,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7383,8 +7383,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7399,8 +7399,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7415,8 +7415,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7431,8 +7431,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7456,9 +7456,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7482,9 +7482,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7508,9 +7508,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7534,9 +7534,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7560,9 +7560,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -8283,8 +8283,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -8300,8 +8300,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -8317,8 +8317,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -8334,8 +8334,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -8351,8 +8351,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -8367,8 +8367,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -8383,8 +8383,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -8399,8 +8399,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -8415,8 +8415,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8431,8 +8431,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8447,8 +8447,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8463,8 +8463,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8479,8 +8479,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8495,8 +8495,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -8511,8 +8511,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -8536,9 +8536,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -8562,9 +8562,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -8588,9 +8588,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -8614,9 +8614,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -8640,9 +8640,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -9363,8 +9363,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -9380,8 +9380,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -9397,8 +9397,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -9414,8 +9414,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -9431,8 +9431,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -9447,8 +9447,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -9463,8 +9463,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -9479,8 +9479,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -9495,8 +9495,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -9511,8 +9511,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -9527,8 +9527,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -9543,8 +9543,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -9559,8 +9559,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -9575,8 +9575,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -9591,8 +9591,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -9616,9 +9616,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -9642,9 +9642,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -9668,9 +9668,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -9694,9 +9694,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -9720,9 +9720,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll
index 25305e056d0b3a..635620bb5ae11b 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll
@@ -1129,7 +1129,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1141,7 +1141,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1153,7 +1153,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1165,7 +1165,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1177,7 +1177,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1189,7 +1189,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1201,7 +1201,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1213,7 +1213,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1225,7 +1225,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1237,7 +1237,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1249,7 +1249,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1261,7 +1261,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1273,7 +1273,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1285,7 +1285,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1297,7 +1297,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1312,7 +1312,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1327,7 +1327,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1342,7 +1342,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -1357,7 +1357,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -1372,7 +1372,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -1904,7 +1904,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1916,7 +1916,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1928,7 +1928,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1940,7 +1940,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1952,7 +1952,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1964,7 +1964,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1976,7 +1976,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1988,7 +1988,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2000,7 +2000,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2012,7 +2012,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2024,7 +2024,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2036,7 +2036,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2048,7 +2048,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2060,7 +2060,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2072,7 +2072,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2086,7 +2086,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2100,7 +2100,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2114,7 +2114,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2128,7 +2128,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2142,7 +2142,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
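All of the unaligned atomicrmw hunks in this file share one shape: the operand
value is parked in a callee-saved register across the call, the new value is
computed with a single ALU op (the `sub w8, w0, w20` lines the -O1 checks
match), and the generic __atomic_compare_exchange libcall retries until the
store wins. The switch to the A510 model only reshuffles which callee-saved
register the value lands in (w19/x19 becomes w20/x20 for the scalar cases; for
i128 the low half moves from x20 to x21 while the high half stays in x19); the
matched logic is unchanged. A minimal source-level sketch of that expansion,
with illustrative names and the libatomic ABI signature assumed:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* libatomic's generic CAS entry point (the `bl __atomic_compare_exchange`
       in the checks), bound via an asm label so this declaration does not
       collide with the compiler builtin of the same name. Link with -latomic. */
    extern bool cas_generic(size_t size, void *obj, void *expected,
                            void *desired, int success_order, int failure_order)
        __asm__("__atomic_compare_exchange");

    uint32_t rmw_sub_u32_unaligned(void *p, uint32_t v) {
        uint32_t old, desired;
        memcpy(&old, p, sizeof old);   /* plain load of the current value */
        do {
            desired = old - v;         /* the ALU op the -O1 checks match */
        } while (!cas_generic(sizeof old, p, &old, &desired,
                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
        return old;                    /* atomicrmw returns the old value */
    }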
@@ -2489,9 +2489,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -2512,9 +2512,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -2535,9 +2535,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -2558,9 +2558,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -2581,9 +2581,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
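For naturally aligned i128 values the expansion stays inline: a
load-exclusive/store-exclusive-pair loop (ldxp/stxp, with the acquire/release
variants ldaxp/stlxp chosen by the ordering). The A510 schedule simply swaps
which half's AND is computed first, and the stxp operands are swapped to
match, so the stored pair is the same. A one-line source equivalent that
should reproduce the ldaxp/and/and/stlxp loop above, assuming a GCC/clang
build for plain armv8-a where 16-byte atomics are inlined (as in these tests):

    /* Sketch only; `fetch_and_u128` is an illustrative name. */
    unsigned __int128 fetch_and_u128(unsigned __int128 *p, unsigned __int128 v) {
        return __atomic_fetch_and(p, v, __ATOMIC_SEQ_CST);
    }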
@@ -2679,7 +2679,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -2691,7 +2691,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -2703,7 +2703,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -2715,7 +2715,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -2727,7 +2727,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -2739,7 +2739,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -2751,7 +2751,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -2763,7 +2763,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2775,7 +2775,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2787,7 +2787,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2799,7 +2799,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2811,7 +2811,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2823,7 +2823,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2835,7 +2835,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2847,7 +2847,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2862,7 +2862,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2877,7 +2877,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2892,7 +2892,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2907,7 +2907,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2922,7 +2922,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3312,9 +3312,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -3339,9 +3339,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -3366,9 +3366,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -3393,9 +3393,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -3420,9 +3420,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
@@ -3530,7 +3530,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -3544,7 +3544,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -3558,7 +3558,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -3572,7 +3572,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -3586,7 +3586,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -3600,7 +3600,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -3614,7 +3614,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -3628,7 +3628,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -3642,7 +3642,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -3656,7 +3656,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -3670,7 +3670,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -3684,7 +3684,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -3698,7 +3698,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -3712,7 +3712,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -3726,7 +3726,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -3744,7 +3744,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3763,7 +3763,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3782,7 +3782,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3801,7 +3801,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3820,7 +3820,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -4169,9 +4169,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4192,9 +4192,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4215,9 +4215,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -4238,9 +4238,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -4261,9 +4261,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -4359,7 +4359,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4371,7 +4371,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4383,7 +4383,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4395,7 +4395,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4407,7 +4407,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4419,7 +4419,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4431,7 +4431,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4443,7 +4443,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4455,7 +4455,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4467,7 +4467,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4479,7 +4479,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4491,7 +4491,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4503,7 +4503,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4515,7 +4515,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4527,7 +4527,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4542,7 +4542,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4557,7 +4557,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4572,7 +4572,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4587,7 +4587,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4602,7 +4602,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4949,9 +4949,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4972,9 +4972,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4995,9 +4995,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -5018,9 +5018,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -5041,9 +5041,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -5139,7 +5139,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5151,7 +5151,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5163,7 +5163,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5175,7 +5175,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5187,7 +5187,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5199,7 +5199,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5211,7 +5211,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5223,7 +5223,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5235,7 +5235,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5247,7 +5247,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5259,7 +5259,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5271,7 +5271,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5283,7 +5283,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5295,7 +5295,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5307,7 +5307,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5322,7 +5322,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5337,7 +5337,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5352,7 +5352,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5367,7 +5367,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5382,7 +5382,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6109,8 +6109,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6127,8 +6127,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6145,8 +6145,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6163,8 +6163,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6181,8 +6181,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6197,8 +6197,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6213,8 +6213,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6229,8 +6229,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6245,8 +6245,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6261,8 +6261,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6277,8 +6277,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6293,8 +6293,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6309,8 +6309,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6325,8 +6325,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6341,8 +6341,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6366,9 +6366,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6392,9 +6392,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6418,9 +6418,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6444,9 +6444,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6470,9 +6470,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
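The signed max family adds a compare-and-select step before entering the same
CAS loop: sign-extend the narrow old value where needed (the sxth), compare it
against the operand, and csel on gt. The value handed to the CAS, at the
source level (illustrative name):

    #include <stdint.h>

    /* Desired value for atomicrmw max on i32:
       cmp w0, w20 ; csel w8, w0, w20, gt in the checks above. */
    static inline int32_t rmw_max_desired(int32_t old, int32_t v) {
        return old > v ? old : v;
    }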
@@ -7199,8 +7199,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7217,8 +7217,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7235,8 +7235,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7253,8 +7253,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7271,8 +7271,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7287,8 +7287,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7303,8 +7303,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7319,8 +7319,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7335,8 +7335,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7351,8 +7351,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7367,8 +7367,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7383,8 +7383,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7399,8 +7399,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7415,8 +7415,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7431,8 +7431,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7456,9 +7456,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7482,9 +7482,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7508,9 +7508,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7534,9 +7534,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7560,9 +7560,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -8283,8 +8283,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -8300,8 +8300,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -8317,8 +8317,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -8334,8 +8334,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -8351,8 +8351,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -8367,8 +8367,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -8383,8 +8383,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -8399,8 +8399,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -8415,8 +8415,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8431,8 +8431,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8447,8 +8447,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8463,8 +8463,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8479,8 +8479,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8495,8 +8495,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -8511,8 +8511,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -8536,9 +8536,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -8562,9 +8562,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -8588,9 +8588,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -8614,9 +8614,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -8640,9 +8640,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -9363,8 +9363,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -9380,8 +9380,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -9397,8 +9397,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -9414,8 +9414,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -9431,8 +9431,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -9447,8 +9447,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -9463,8 +9463,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -9479,8 +9479,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -9495,8 +9495,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -9511,8 +9511,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -9527,8 +9527,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -9543,8 +9543,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -9559,8 +9559,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -9575,8 +9575,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -9591,8 +9591,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -9616,9 +9616,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -9642,9 +9642,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -9668,9 +9668,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -9694,9 +9694,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -9720,9 +9720,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll
index 296dcf40523233..7fc733a13bf07f 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8_1a.ll
@@ -689,7 +689,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -701,7 +701,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -713,7 +713,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -725,7 +725,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -737,7 +737,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -749,7 +749,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -761,7 +761,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -773,7 +773,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -785,7 +785,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -797,7 +797,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -809,7 +809,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -821,7 +821,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -833,7 +833,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -845,7 +845,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -857,7 +857,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -872,7 +872,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -887,7 +887,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -902,7 +902,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -917,7 +917,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -932,7 +932,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -1219,7 +1219,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1231,7 +1231,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1243,7 +1243,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1255,7 +1255,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1267,7 +1267,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1279,7 +1279,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1291,7 +1291,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1303,7 +1303,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1315,7 +1315,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1327,7 +1327,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1339,7 +1339,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1351,7 +1351,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1363,7 +1363,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1375,7 +1375,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1387,7 +1387,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1401,7 +1401,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1415,7 +1415,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1429,7 +1429,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -1443,7 +1443,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -1457,7 +1457,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -1774,7 +1774,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1786,7 +1786,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1798,7 +1798,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1810,7 +1810,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1822,7 +1822,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1834,7 +1834,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1846,7 +1846,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1858,7 +1858,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1870,7 +1870,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1882,7 +1882,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1894,7 +1894,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1906,7 +1906,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1918,7 +1918,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1930,7 +1930,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1942,7 +1942,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1957,7 +1957,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1972,7 +1972,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1987,7 +1987,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2002,7 +2002,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2017,7 +2017,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -2565,7 +2565,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -2579,7 +2579,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -2593,7 +2593,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -2607,7 +2607,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -2621,7 +2621,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -2635,7 +2635,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -2649,7 +2649,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -2663,7 +2663,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -2677,7 +2677,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -2691,7 +2691,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -2705,7 +2705,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -2719,7 +2719,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -2733,7 +2733,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -2747,7 +2747,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -2761,7 +2761,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -2779,7 +2779,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2798,7 +2798,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2817,7 +2817,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2836,7 +2836,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2855,7 +2855,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3149,7 +3149,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3161,7 +3161,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3173,7 +3173,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3185,7 +3185,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3197,7 +3197,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3209,7 +3209,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3221,7 +3221,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3233,7 +3233,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3245,7 +3245,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3257,7 +3257,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3269,7 +3269,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3281,7 +3281,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3293,7 +3293,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3305,7 +3305,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3317,7 +3317,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3332,7 +3332,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3347,7 +3347,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3362,7 +3362,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3377,7 +3377,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3392,7 +3392,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3684,7 +3684,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3696,7 +3696,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3708,7 +3708,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3720,7 +3720,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3732,7 +3732,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3744,7 +3744,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3756,7 +3756,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3768,7 +3768,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3780,7 +3780,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3792,7 +3792,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3804,7 +3804,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3816,7 +3816,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3828,7 +3828,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3840,7 +3840,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3852,7 +3852,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3867,7 +3867,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3882,7 +3882,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3897,7 +3897,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3912,7 +3912,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3927,7 +3927,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4279,8 +4279,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4297,8 +4297,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4315,8 +4315,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4333,8 +4333,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4351,8 +4351,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4367,8 +4367,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4383,8 +4383,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4399,8 +4399,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4415,8 +4415,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4431,8 +4431,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4447,8 +4447,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4463,8 +4463,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4479,8 +4479,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4495,8 +4495,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4511,8 +4511,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4536,9 +4536,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4562,9 +4562,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4588,9 +4588,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4614,9 +4614,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4640,9 +4640,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4994,8 +4994,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5012,8 +5012,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5030,8 +5030,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5048,8 +5048,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5066,8 +5066,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5082,8 +5082,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5098,8 +5098,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5114,8 +5114,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5130,8 +5130,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5146,8 +5146,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5162,8 +5162,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5178,8 +5178,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5194,8 +5194,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5210,8 +5210,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5226,8 +5226,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5251,9 +5251,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5277,9 +5277,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5303,9 +5303,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5329,9 +5329,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5355,9 +5355,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -5708,8 +5708,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5725,8 +5725,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5742,8 +5742,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5759,8 +5759,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5776,8 +5776,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5792,8 +5792,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5808,8 +5808,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5824,8 +5824,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5840,8 +5840,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5856,8 +5856,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5872,8 +5872,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5888,8 +5888,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5904,8 +5904,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5920,8 +5920,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5936,8 +5936,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5961,9 +5961,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5987,9 +5987,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6013,9 +6013,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6039,9 +6039,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6065,9 +6065,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6418,8 +6418,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6435,8 +6435,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6452,8 +6452,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6469,8 +6469,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6486,8 +6486,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6502,8 +6502,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6518,8 +6518,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6534,8 +6534,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6550,8 +6550,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6566,8 +6566,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6582,8 +6582,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6598,8 +6598,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6614,8 +6614,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6630,8 +6630,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6646,8 +6646,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6671,9 +6671,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6697,9 +6697,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6723,9 +6723,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6749,9 +6749,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6775,9 +6775,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll
index f57d8006f7e8b7..0ea04d18788f68 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll
@@ -1129,7 +1129,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1141,7 +1141,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1153,7 +1153,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1165,7 +1165,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1177,7 +1177,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1189,7 +1189,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1201,7 +1201,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1213,7 +1213,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1225,7 +1225,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1237,7 +1237,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1249,7 +1249,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1261,7 +1261,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1273,7 +1273,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1285,7 +1285,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1297,7 +1297,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1312,7 +1312,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1327,7 +1327,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1342,7 +1342,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -1357,7 +1357,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -1372,7 +1372,7 @@ define dso_local i128 @atomicrmw_add_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_add_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    adds x8, x0, x20
+; -O1:    adds x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -1904,7 +1904,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1916,7 +1916,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1928,7 +1928,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1940,7 +1940,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1952,7 +1952,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1964,7 +1964,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1976,7 +1976,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1988,7 +1988,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2000,7 +2000,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2012,7 +2012,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2024,7 +2024,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2036,7 +2036,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2048,7 +2048,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2060,7 +2060,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2072,7 +2072,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2086,7 +2086,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2100,7 +2100,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2114,7 +2114,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2128,7 +2128,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2142,7 +2142,7 @@ define dso_local i128 @atomicrmw_sub_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    subs x8, x0, x20
+; -O1:    subs x8, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -2489,9 +2489,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -2512,9 +2512,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -2535,9 +2535,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -2558,9 +2558,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -2581,9 +2581,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    and x9, x0, x2
-; -O1:    and x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x1, x3
+; -O1:    and x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -2679,7 +2679,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -2691,7 +2691,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -2703,7 +2703,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -2715,7 +2715,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -2727,7 +2727,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -2739,7 +2739,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -2751,7 +2751,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -2763,7 +2763,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2775,7 +2775,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2787,7 +2787,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2799,7 +2799,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2811,7 +2811,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2823,7 +2823,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2835,7 +2835,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2847,7 +2847,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2862,7 +2862,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2877,7 +2877,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2892,7 +2892,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2907,7 +2907,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2922,7 +2922,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3312,9 +3312,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -3339,9 +3339,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -3366,9 +3366,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -3393,9 +3393,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -3420,9 +3420,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
 ; -O1:    and x9, x0, x2
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x1, x3
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
@@ -3530,7 +3530,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -3544,7 +3544,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -3558,7 +3558,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -3572,7 +3572,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -3586,7 +3586,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -3600,7 +3600,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -3614,7 +3614,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -3628,7 +3628,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -3642,7 +3642,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -3656,7 +3656,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -3670,7 +3670,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -3684,7 +3684,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -3698,7 +3698,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -3712,7 +3712,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -3726,7 +3726,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -3744,7 +3744,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3763,7 +3763,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3782,7 +3782,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3801,7 +3801,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3820,7 +3820,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -4169,9 +4169,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4192,9 +4192,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4215,9 +4215,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -4238,9 +4238,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -4261,9 +4261,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    orr x9, x0, x2
-; -O1:    orr x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x1, x3
+; -O1:    orr x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -4359,7 +4359,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4371,7 +4371,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4383,7 +4383,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4395,7 +4395,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4407,7 +4407,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4419,7 +4419,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4431,7 +4431,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4443,7 +4443,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4455,7 +4455,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4467,7 +4467,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4479,7 +4479,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4491,7 +4491,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4503,7 +4503,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4515,7 +4515,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4527,7 +4527,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4542,7 +4542,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4557,7 +4557,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4572,7 +4572,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4587,7 +4587,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4602,7 +4602,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4949,9 +4949,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4972,9 +4972,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4995,9 +4995,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -5018,9 +5018,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -5041,9 +5041,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x0, x1, [x8]
-; -O1:    eor x9, x0, x2
-; -O1:    eor x10, x1, x3
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x1, x3
+; -O1:    eor x10, x0, x2
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -5139,7 +5139,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5151,7 +5151,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5163,7 +5163,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5175,7 +5175,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5187,7 +5187,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5199,7 +5199,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5211,7 +5211,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5223,7 +5223,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5235,7 +5235,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5247,7 +5247,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5259,7 +5259,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5271,7 +5271,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5283,7 +5283,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5295,7 +5295,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5307,7 +5307,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5322,7 +5322,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5337,7 +5337,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5352,7 +5352,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5367,7 +5367,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5382,7 +5382,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6109,8 +6109,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6127,8 +6127,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6145,8 +6145,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6163,8 +6163,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6181,8 +6181,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6197,8 +6197,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6213,8 +6213,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6229,8 +6229,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6245,8 +6245,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6261,8 +6261,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6277,8 +6277,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6293,8 +6293,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6309,8 +6309,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6325,8 +6325,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6341,8 +6341,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6366,9 +6366,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6392,9 +6392,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6418,9 +6418,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6444,9 +6444,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6470,9 +6470,9 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_max_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -7199,8 +7199,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7217,8 +7217,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7235,8 +7235,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7253,8 +7253,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7271,8 +7271,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7287,8 +7287,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7303,8 +7303,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7319,8 +7319,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7335,8 +7335,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7351,8 +7351,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7367,8 +7367,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7383,8 +7383,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7399,8 +7399,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7415,8 +7415,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7431,8 +7431,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7456,9 +7456,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7482,9 +7482,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7508,9 +7508,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7534,9 +7534,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7560,9 +7560,9 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_min_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -8283,8 +8283,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -8300,8 +8300,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -8317,8 +8317,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -8334,8 +8334,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -8351,8 +8351,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -8367,8 +8367,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -8383,8 +8383,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -8399,8 +8399,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -8415,8 +8415,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8431,8 +8431,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8447,8 +8447,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8463,8 +8463,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8479,8 +8479,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8495,8 +8495,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -8511,8 +8511,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -8536,9 +8536,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -8562,9 +8562,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -8588,9 +8588,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -8614,9 +8614,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -8640,9 +8640,9 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -9363,8 +9363,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -9380,8 +9380,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -9397,8 +9397,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -9414,8 +9414,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -9431,8 +9431,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -9447,8 +9447,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -9463,8 +9463,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -9479,8 +9479,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -9495,8 +9495,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -9511,8 +9511,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -9527,8 +9527,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -9543,8 +9543,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -9559,8 +9559,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -9575,8 +9575,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -9591,8 +9591,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -9616,9 +9616,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -9642,9 +9642,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -9668,9 +9668,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -9694,9 +9694,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -9720,9 +9720,9 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
-; -O1:    cmp x20, x0
+; -O1:    cmp x21, x0
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll
index 926827e6a875ff..01317e09028c35 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll
@@ -1129,7 +1129,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1141,7 +1141,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1153,7 +1153,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1165,7 +1165,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1177,7 +1177,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1189,7 +1189,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1201,7 +1201,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1213,7 +1213,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1225,7 +1225,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1237,7 +1237,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1249,7 +1249,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1261,7 +1261,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1273,7 +1273,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1285,7 +1285,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1297,7 +1297,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1894,7 +1894,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1906,7 +1906,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1918,7 +1918,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1930,7 +1930,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1942,7 +1942,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1954,7 +1954,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1966,7 +1966,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1978,7 +1978,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1990,7 +1990,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2002,7 +2002,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2014,7 +2014,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2026,7 +2026,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2038,7 +2038,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2050,7 +2050,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2062,7 +2062,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2487,9 +2487,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -2508,9 +2508,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -2529,9 +2529,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -2550,9 +2550,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -2571,9 +2571,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -2669,7 +2669,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -2681,7 +2681,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -2693,7 +2693,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -2705,7 +2705,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -2717,7 +2717,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -2729,7 +2729,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -2741,7 +2741,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -2753,7 +2753,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2765,7 +2765,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2777,7 +2777,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2789,7 +2789,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2801,7 +2801,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2813,7 +2813,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2825,7 +2825,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2837,7 +2837,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2852,7 +2852,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2867,7 +2867,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2882,7 +2882,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2897,7 +2897,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2912,7 +2912,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3315,9 +3315,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -3340,9 +3340,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -3365,9 +3365,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -3390,9 +3390,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -3415,9 +3415,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
@@ -3525,7 +3525,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -3539,7 +3539,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -3553,7 +3553,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -3567,7 +3567,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -3581,7 +3581,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -3595,7 +3595,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -3609,7 +3609,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -3623,7 +3623,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -3637,7 +3637,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -3651,7 +3651,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -3665,7 +3665,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -3679,7 +3679,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -3693,7 +3693,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -3707,7 +3707,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -3721,7 +3721,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -3739,7 +3739,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3758,7 +3758,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3777,7 +3777,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3796,7 +3796,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3815,7 +3815,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -4177,9 +4177,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4198,9 +4198,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4219,9 +4219,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -4240,9 +4240,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -4261,9 +4261,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -4359,7 +4359,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4371,7 +4371,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4383,7 +4383,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4395,7 +4395,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4407,7 +4407,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4419,7 +4419,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4431,7 +4431,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4443,7 +4443,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4455,7 +4455,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4467,7 +4467,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4479,7 +4479,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4491,7 +4491,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4503,7 +4503,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4515,7 +4515,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4527,7 +4527,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4542,7 +4542,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4557,7 +4557,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4572,7 +4572,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4587,7 +4587,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4602,7 +4602,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4962,9 +4962,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4983,9 +4983,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -5004,9 +5004,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -5025,9 +5025,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -5046,9 +5046,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
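
For context on the aligned-i128 hunks above: at -O1 these operations lower to an exclusive-pair retry loop, and the test pins only the load, the two eors, and the store. A sketch of the full seq_cst loop, reusing the register numbers from the CHECK lines (the label, the mov, the retry branch, and the ret are the standard load/store-exclusive expansion and are not checked by the test):

    atomicrmw_xor_i128_aligned_seq_cst:   // sketch, not a CHECK line
        mov    x8, x0                 // free x0/x1 for the result pair
    .Lretry:
        ldaxp  x1, x0, [x8]           // load-acquire the 16-byte pair
        eor    x9, x0, x2             // new = old ^ value, one half...
        eor    x10, x1, x3            // ...and the other half
        stlxp  w11, x10, x9, [x8]     // store-release; w11 = 0 on success
        cbnz   w11, .Lretry           // lost the exclusive: retry
        ret                           // old value is returned in x0/x1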
@@ -5144,7 +5144,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5156,7 +5156,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5168,7 +5168,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5180,7 +5180,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5192,7 +5192,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5204,7 +5204,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5216,7 +5216,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5228,7 +5228,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5240,7 +5240,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5252,7 +5252,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5264,7 +5264,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5276,7 +5276,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5288,7 +5288,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5300,7 +5300,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5312,7 +5312,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5327,7 +5327,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5342,7 +5342,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5357,7 +5357,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5372,7 +5372,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5387,7 +5387,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6022,8 +6022,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6038,8 +6038,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6054,8 +6054,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6070,8 +6070,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6086,8 +6086,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6100,8 +6100,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6114,8 +6114,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6128,8 +6128,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6142,8 +6142,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6156,8 +6156,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6170,8 +6170,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6184,8 +6184,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6198,8 +6198,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6212,8 +6212,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6226,8 +6226,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6244,7 +6244,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6261,7 +6261,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6278,7 +6278,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6295,7 +6295,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6312,7 +6312,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6947,8 +6947,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6963,8 +6963,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6979,8 +6979,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6995,8 +6995,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7011,8 +7011,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7025,8 +7025,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7039,8 +7039,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7053,8 +7053,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7067,8 +7067,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7081,8 +7081,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7095,8 +7095,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7109,8 +7109,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7123,8 +7123,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7137,8 +7137,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7151,8 +7151,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7169,7 +7169,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7186,7 +7186,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7203,7 +7203,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7220,7 +7220,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7237,7 +7237,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -7872,8 +7872,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7888,8 +7888,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7904,8 +7904,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7920,8 +7920,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7936,8 +7936,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7950,8 +7950,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7964,8 +7964,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7978,8 +7978,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7992,8 +7992,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8006,8 +8006,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8020,8 +8020,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8034,8 +8034,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8048,8 +8048,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8062,8 +8062,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -8076,8 +8076,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -8094,7 +8094,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -8111,7 +8111,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -8128,7 +8128,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -8145,7 +8145,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -8162,7 +8162,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -8797,8 +8797,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -8813,8 +8813,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -8829,8 +8829,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -8845,8 +8845,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -8861,8 +8861,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -8875,8 +8875,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -8889,8 +8889,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -8903,8 +8903,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -8917,8 +8917,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8931,8 +8931,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8945,8 +8945,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8959,8 +8959,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8973,8 +8973,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8987,8 +8987,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -9001,8 +9001,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -9019,7 +9019,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -9036,7 +9036,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -9053,7 +9053,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -9070,7 +9070,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -9087,7 +9087,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
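
The unaligned cases in this file all funnel into the libatomic helper: the value operand is kept in callee-saved registers across the call (x20, or x19/x21 for the i128 halves, after this change), the current contents are loaded, the desired value is computed, and __atomic_compare_exchange performs the actual compare-and-swap, with the caller looping on failure. A caller-side sketch for the i128 xor case, register numbers as in the CHECK lines (the stack buffers, the pointer copy, the call's size/ordering arguments, and the retry loop around the call are elided):

    ldp   x0, x1, [x0]                 // current (expected) 16-byte value
    eor   x8, x1, x19                  // desired = current ^ value; the
    eor   x9, x0, x21                  // halves live in callee-saved regs
    bl    __atomic_compare_exchange    // CAS; caller retries on failure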

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2_lse128.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2_lse128.ll
index bb1702bc58ca7f..83e383f335637c 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2_lse128.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2_lse128.ll
@@ -634,7 +634,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -646,7 +646,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -658,7 +658,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -670,7 +670,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -682,7 +682,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -694,7 +694,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -706,7 +706,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -718,7 +718,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -730,7 +730,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -742,7 +742,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -754,7 +754,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -766,7 +766,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -778,7 +778,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -790,7 +790,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -802,7 +802,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1239,7 +1239,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1251,7 +1251,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1263,7 +1263,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1275,7 +1275,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1287,7 +1287,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1299,7 +1299,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1311,7 +1311,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1323,7 +1323,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1335,7 +1335,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1347,7 +1347,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1359,7 +1359,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1371,7 +1371,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1383,7 +1383,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1395,7 +1395,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1407,7 +1407,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1754,7 +1754,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1766,7 +1766,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1778,7 +1778,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1790,7 +1790,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1802,7 +1802,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1814,7 +1814,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1826,7 +1826,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1838,7 +1838,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1850,7 +1850,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1862,7 +1862,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1874,7 +1874,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1886,7 +1886,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1898,7 +1898,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1910,7 +1910,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1922,7 +1922,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1937,7 +1937,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -1952,7 +1952,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -1967,7 +1967,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -1982,7 +1982,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -1997,7 +1997,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -2550,7 +2550,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -2564,7 +2564,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -2578,7 +2578,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -2592,7 +2592,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -2606,7 +2606,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -2620,7 +2620,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -2634,7 +2634,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -2648,7 +2648,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -2662,7 +2662,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -2676,7 +2676,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -2690,7 +2690,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -2704,7 +2704,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -2718,7 +2718,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -2732,7 +2732,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -2746,7 +2746,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -2764,7 +2764,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2783,7 +2783,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2802,7 +2802,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2821,7 +2821,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2840,7 +2840,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3079,7 +3079,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3091,7 +3091,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3103,7 +3103,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3115,7 +3115,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3127,7 +3127,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3139,7 +3139,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3151,7 +3151,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3163,7 +3163,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3175,7 +3175,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3187,7 +3187,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3199,7 +3199,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3211,7 +3211,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3223,7 +3223,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3235,7 +3235,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3247,7 +3247,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3262,7 +3262,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3277,7 +3277,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3292,7 +3292,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3307,7 +3307,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3322,7 +3322,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3604,7 +3604,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3616,7 +3616,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3628,7 +3628,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3640,7 +3640,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3652,7 +3652,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3664,7 +3664,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3676,7 +3676,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3688,7 +3688,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3700,7 +3700,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3712,7 +3712,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3724,7 +3724,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3736,7 +3736,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3748,7 +3748,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3760,7 +3760,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3772,7 +3772,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3787,7 +3787,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3802,7 +3802,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3817,7 +3817,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3832,7 +3832,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3847,7 +3847,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4142,8 +4142,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4158,8 +4158,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4174,8 +4174,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4190,8 +4190,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4206,8 +4206,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4220,8 +4220,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4234,8 +4234,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4248,8 +4248,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4262,8 +4262,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4276,8 +4276,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4290,8 +4290,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4304,8 +4304,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4318,8 +4318,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4332,8 +4332,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4346,8 +4346,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4364,7 +4364,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4381,7 +4381,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4398,7 +4398,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4415,7 +4415,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4432,7 +4432,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4727,8 +4727,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4743,8 +4743,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4759,8 +4759,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4775,8 +4775,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4791,8 +4791,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4805,8 +4805,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4819,8 +4819,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4833,8 +4833,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4847,8 +4847,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4861,8 +4861,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4875,8 +4875,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4889,8 +4889,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4903,8 +4903,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4917,8 +4917,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4931,8 +4931,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4949,7 +4949,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4966,7 +4966,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4983,7 +4983,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5000,7 +5000,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5017,7 +5017,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -5312,8 +5312,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5328,8 +5328,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5344,8 +5344,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5360,8 +5360,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5376,8 +5376,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5390,8 +5390,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5404,8 +5404,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5418,8 +5418,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5432,8 +5432,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5446,8 +5446,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5460,8 +5460,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5474,8 +5474,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5488,8 +5488,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5502,8 +5502,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5516,8 +5516,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5534,7 +5534,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5551,7 +5551,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5568,7 +5568,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5585,7 +5585,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5602,7 +5602,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -5897,8 +5897,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5913,8 +5913,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5929,8 +5929,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5945,8 +5945,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5961,8 +5961,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5975,8 +5975,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5989,8 +5989,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6003,8 +6003,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6017,8 +6017,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6031,8 +6031,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6045,8 +6045,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6059,8 +6059,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6073,8 +6073,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6087,8 +6087,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6101,8 +6101,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6119,7 +6119,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6136,7 +6136,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6153,7 +6153,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6170,7 +6170,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6187,7 +6187,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

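The CHECK-line churn in these outline-atomics files all follows one pattern: the assertions are autogenerated from llc output, and with the updated generic scheduling model the value operand is kept in a different callee-saved register across the __atomic_compare_exchange libcall (w19 becomes w20 for i16/i32, x19 becomes x20 for i64, and one half of an i128 moves from x20 to x21). A minimal standalone reproduction — a sketch assuming llc built from this revision and the big-endian outline-atomics configuration the next file's name suggests, not the test's exact RUN line — is:

    ; reduced.ll -- one function lifted verbatim from the test below
    define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value) {
      %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
      ret i32 %r
    }

    $ llc reduced.ll -o - -mtriple=aarch64_be -mattr=+outline-atomics -O1

At -O1 the unaligned atomicrmw lowers to a compare-exchange loop, so %value must survive the call; before this change the add that computes the desired value read it from w19, afterwards from w20, which is exactly what the -/+ pairs below record.
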
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll
index d8ffc1ad00f856..f9c1a2216dc2c1 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll
@@ -639,7 +639,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -651,7 +651,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -663,7 +663,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -675,7 +675,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -687,7 +687,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -699,7 +699,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -711,7 +711,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -723,7 +723,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -735,7 +735,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -747,7 +747,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -759,7 +759,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -771,7 +771,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -783,7 +783,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -795,7 +795,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -807,7 +807,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1234,7 +1234,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1246,7 +1246,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1258,7 +1258,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1270,7 +1270,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1282,7 +1282,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1294,7 +1294,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1306,7 +1306,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1318,7 +1318,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1330,7 +1330,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1342,7 +1342,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1354,7 +1354,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1366,7 +1366,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1378,7 +1378,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1390,7 +1390,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1402,7 +1402,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1723,9 +1723,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -1740,9 +1740,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -1757,9 +1757,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -1774,9 +1774,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -1791,9 +1791,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -1864,7 +1864,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1876,7 +1876,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1888,7 +1888,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1900,7 +1900,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1912,7 +1912,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1924,7 +1924,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1936,7 +1936,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1948,7 +1948,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1960,7 +1960,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1972,7 +1972,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1984,7 +1984,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1996,7 +1996,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2008,7 +2008,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2020,7 +2020,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2032,7 +2032,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2047,7 +2047,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2062,7 +2062,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2077,7 +2077,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2092,7 +2092,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2107,7 +2107,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -2466,9 +2466,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -2487,9 +2487,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -2508,9 +2508,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -2529,9 +2529,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -2550,9 +2550,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
@@ -2650,7 +2650,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -2664,7 +2664,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -2678,7 +2678,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -2692,7 +2692,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -2706,7 +2706,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -2720,7 +2720,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -2734,7 +2734,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -2748,7 +2748,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -2762,7 +2762,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -2776,7 +2776,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -2790,7 +2790,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -2804,7 +2804,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -2818,7 +2818,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -2832,7 +2832,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -2846,7 +2846,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -2864,7 +2864,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2883,7 +2883,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2902,7 +2902,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2921,7 +2921,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2940,7 +2940,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3098,9 +3098,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -3115,9 +3115,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -3132,9 +3132,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -3149,9 +3149,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -3166,9 +3166,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -3214,7 +3214,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3226,7 +3226,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3238,7 +3238,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3250,7 +3250,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3262,7 +3262,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3274,7 +3274,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3286,7 +3286,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3298,7 +3298,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3310,7 +3310,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3322,7 +3322,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3334,7 +3334,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3346,7 +3346,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3358,7 +3358,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3370,7 +3370,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3382,7 +3382,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3397,7 +3397,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3412,7 +3412,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3427,7 +3427,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3442,7 +3442,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3457,7 +3457,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3613,9 +3613,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -3630,9 +3630,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -3647,9 +3647,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -3664,9 +3664,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -3681,9 +3681,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -3729,7 +3729,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3741,7 +3741,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3753,7 +3753,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3765,7 +3765,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3777,7 +3777,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3789,7 +3789,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3801,7 +3801,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3813,7 +3813,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3825,7 +3825,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3837,7 +3837,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3849,7 +3849,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3861,7 +3861,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3873,7 +3873,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3885,7 +3885,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3897,7 +3897,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3912,7 +3912,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3927,7 +3927,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3942,7 +3942,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3957,7 +3957,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3972,7 +3972,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4537,8 +4537,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4553,8 +4553,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4569,8 +4569,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4585,8 +4585,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4601,8 +4601,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4615,8 +4615,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4629,8 +4629,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4643,8 +4643,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4657,8 +4657,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4671,8 +4671,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4685,8 +4685,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4699,8 +4699,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4713,8 +4713,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4727,8 +4727,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4741,8 +4741,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4759,7 +4759,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4776,7 +4776,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4793,7 +4793,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4810,7 +4810,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4827,7 +4827,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -5392,8 +5392,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5408,8 +5408,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5424,8 +5424,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5440,8 +5440,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5456,8 +5456,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5470,8 +5470,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5484,8 +5484,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5498,8 +5498,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5512,8 +5512,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5526,8 +5526,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5540,8 +5540,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5554,8 +5554,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5568,8 +5568,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5582,8 +5582,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5596,8 +5596,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5614,7 +5614,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5631,7 +5631,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5648,7 +5648,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5665,7 +5665,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5682,7 +5682,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6247,8 +6247,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6263,8 +6263,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6279,8 +6279,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6295,8 +6295,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6311,8 +6311,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6325,8 +6325,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6339,8 +6339,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6353,8 +6353,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6367,8 +6367,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6381,8 +6381,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6395,8 +6395,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6409,8 +6409,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6423,8 +6423,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6437,8 +6437,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6451,8 +6451,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6469,7 +6469,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6486,7 +6486,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6503,7 +6503,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6520,7 +6520,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6537,7 +6537,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -7102,8 +7102,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7118,8 +7118,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7134,8 +7134,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7150,8 +7150,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7166,8 +7166,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7180,8 +7180,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7194,8 +7194,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7208,8 +7208,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7222,8 +7222,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7236,8 +7236,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7250,8 +7250,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7264,8 +7264,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7278,8 +7278,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7292,8 +7292,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7306,8 +7306,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7324,7 +7324,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7341,7 +7341,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7358,7 +7358,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7375,7 +7375,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7392,7 +7392,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll
index e1fa58928a3dc9..1bead6d694c652 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll
@@ -1129,7 +1129,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1141,7 +1141,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1153,7 +1153,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1165,7 +1165,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1177,7 +1177,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1189,7 +1189,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1201,7 +1201,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1213,7 +1213,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1225,7 +1225,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1237,7 +1237,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1249,7 +1249,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1261,7 +1261,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1273,7 +1273,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1285,7 +1285,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1297,7 +1297,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1894,7 +1894,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1906,7 +1906,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1918,7 +1918,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1930,7 +1930,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1942,7 +1942,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1954,7 +1954,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1966,7 +1966,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1978,7 +1978,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1990,7 +1990,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2002,7 +2002,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2014,7 +2014,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2026,7 +2026,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2038,7 +2038,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2050,7 +2050,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2062,7 +2062,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2487,9 +2487,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -2508,9 +2508,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -2529,9 +2529,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -2550,9 +2550,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -2571,9 +2571,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -2669,7 +2669,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -2681,7 +2681,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -2693,7 +2693,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -2705,7 +2705,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -2717,7 +2717,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -2729,7 +2729,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -2741,7 +2741,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -2753,7 +2753,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2765,7 +2765,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2777,7 +2777,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2789,7 +2789,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2801,7 +2801,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2813,7 +2813,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2825,7 +2825,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2837,7 +2837,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2852,7 +2852,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2867,7 +2867,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2882,7 +2882,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2897,7 +2897,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2912,7 +2912,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3315,9 +3315,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -3340,9 +3340,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -3365,9 +3365,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -3390,9 +3390,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -3415,9 +3415,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
@@ -3525,7 +3525,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -3539,7 +3539,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -3553,7 +3553,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -3567,7 +3567,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -3581,7 +3581,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -3595,7 +3595,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -3609,7 +3609,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -3623,7 +3623,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -3637,7 +3637,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -3651,7 +3651,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -3665,7 +3665,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -3679,7 +3679,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -3693,7 +3693,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -3707,7 +3707,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -3721,7 +3721,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -3739,7 +3739,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3758,7 +3758,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3777,7 +3777,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3796,7 +3796,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3815,7 +3815,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -4177,9 +4177,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4198,9 +4198,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4219,9 +4219,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -4240,9 +4240,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -4261,9 +4261,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -4359,7 +4359,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4371,7 +4371,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4383,7 +4383,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4395,7 +4395,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4407,7 +4407,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4419,7 +4419,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4431,7 +4431,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4443,7 +4443,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4455,7 +4455,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4467,7 +4467,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4479,7 +4479,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4491,7 +4491,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4503,7 +4503,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4515,7 +4515,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4527,7 +4527,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4542,7 +4542,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4557,7 +4557,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4572,7 +4572,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4587,7 +4587,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4602,7 +4602,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4962,9 +4962,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4983,9 +4983,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -5004,9 +5004,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -5025,9 +5025,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -5046,9 +5046,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -5144,7 +5144,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5156,7 +5156,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5168,7 +5168,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5180,7 +5180,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5192,7 +5192,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5204,7 +5204,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5216,7 +5216,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5228,7 +5228,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5240,7 +5240,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5252,7 +5252,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5264,7 +5264,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5276,7 +5276,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5288,7 +5288,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5300,7 +5300,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5312,7 +5312,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5327,7 +5327,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5342,7 +5342,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5357,7 +5357,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5372,7 +5372,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5387,7 +5387,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6022,8 +6022,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6038,8 +6038,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6054,8 +6054,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6070,8 +6070,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6086,8 +6086,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6100,8 +6100,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6114,8 +6114,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6128,8 +6128,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6142,8 +6142,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6156,8 +6156,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6170,8 +6170,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6184,8 +6184,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6198,8 +6198,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6212,8 +6212,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6226,8 +6226,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6244,7 +6244,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6261,7 +6261,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6278,7 +6278,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6295,7 +6295,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6312,7 +6312,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6947,8 +6947,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6963,8 +6963,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6979,8 +6979,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6995,8 +6995,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7011,8 +7011,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7025,8 +7025,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7039,8 +7039,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7053,8 +7053,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7067,8 +7067,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7081,8 +7081,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7095,8 +7095,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7109,8 +7109,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7123,8 +7123,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7137,8 +7137,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7151,8 +7151,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7169,7 +7169,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7186,7 +7186,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7203,7 +7203,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7220,7 +7220,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7237,7 +7237,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -7872,8 +7872,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7888,8 +7888,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7904,8 +7904,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7920,8 +7920,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7936,8 +7936,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7950,8 +7950,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7964,8 +7964,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7978,8 +7978,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7992,8 +7992,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8006,8 +8006,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8020,8 +8020,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8034,8 +8034,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8048,8 +8048,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8062,8 +8062,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -8076,8 +8076,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -8094,7 +8094,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -8111,7 +8111,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -8128,7 +8128,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -8145,7 +8145,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -8162,7 +8162,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -8797,8 +8797,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -8813,8 +8813,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -8829,8 +8829,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -8845,8 +8845,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -8861,8 +8861,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -8875,8 +8875,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -8889,8 +8889,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -8903,8 +8903,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -8917,8 +8917,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8931,8 +8931,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8945,8 +8945,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8959,8 +8959,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8973,8 +8973,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8987,8 +8987,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -9001,8 +9001,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -9019,7 +9019,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -9036,7 +9036,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -9053,7 +9053,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -9070,7 +9070,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -9087,7 +9087,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

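The w19->w20 and x20->x21 renumbering above (and the swapped orr/eor pairs in the aligned i128 loops) appears to be a side effect of the new scheduling model perturbing register allocation and instruction order; the expansion itself is unchanged. Each unaligned test still lowers to a load, the RMW operation, and a __atomic_compare_exchange libcall loop. As a minimal sketch of the functions being checked (mirroring this file's tests, with the ordering suffix dropped for brevity):

    define dso_local i32 @atomicrmw_xor_i32_unaligned(ptr %ptr, i32 %value) {
      %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
      ret i32 %r
    }

At -O1 the xor is now checked as "eor w8, w0, w20" because %value is kept in w20 rather than w19 across the libcall.
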
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll
index 9458345fef72e0..51d9766f6a8f92 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll
@@ -1129,7 +1129,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1141,7 +1141,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1153,7 +1153,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1165,7 +1165,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1177,7 +1177,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1189,7 +1189,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1201,7 +1201,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1213,7 +1213,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1225,7 +1225,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1237,7 +1237,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1249,7 +1249,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1261,7 +1261,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1273,7 +1273,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1285,7 +1285,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1297,7 +1297,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1894,7 +1894,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1906,7 +1906,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1918,7 +1918,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1930,7 +1930,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1942,7 +1942,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1954,7 +1954,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1966,7 +1966,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1978,7 +1978,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1990,7 +1990,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2002,7 +2002,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2014,7 +2014,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2026,7 +2026,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2038,7 +2038,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2050,7 +2050,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2062,7 +2062,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2487,9 +2487,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -2508,9 +2508,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -2529,9 +2529,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -2550,9 +2550,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -2571,9 +2571,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -2669,7 +2669,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -2681,7 +2681,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -2693,7 +2693,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -2705,7 +2705,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -2717,7 +2717,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -2729,7 +2729,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -2741,7 +2741,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -2753,7 +2753,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2765,7 +2765,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2777,7 +2777,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2789,7 +2789,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2801,7 +2801,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2813,7 +2813,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2825,7 +2825,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2837,7 +2837,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2852,7 +2852,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2867,7 +2867,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2882,7 +2882,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2897,7 +2897,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2912,7 +2912,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3315,9 +3315,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -3340,9 +3340,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -3365,9 +3365,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -3390,9 +3390,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -3415,9 +3415,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
@@ -3525,7 +3525,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -3539,7 +3539,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -3553,7 +3553,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -3567,7 +3567,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -3581,7 +3581,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -3595,7 +3595,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -3609,7 +3609,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -3623,7 +3623,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -3637,7 +3637,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -3651,7 +3651,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -3665,7 +3665,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -3679,7 +3679,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -3693,7 +3693,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -3707,7 +3707,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -3721,7 +3721,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -3739,7 +3739,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3758,7 +3758,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3777,7 +3777,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3796,7 +3796,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3815,7 +3815,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -4177,9 +4177,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4198,9 +4198,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4219,9 +4219,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -4240,9 +4240,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -4261,9 +4261,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -4359,7 +4359,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4371,7 +4371,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4383,7 +4383,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4395,7 +4395,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4407,7 +4407,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4419,7 +4419,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4431,7 +4431,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4443,7 +4443,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4455,7 +4455,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4467,7 +4467,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4479,7 +4479,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4491,7 +4491,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4503,7 +4503,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4515,7 +4515,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4527,7 +4527,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4542,7 +4542,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4557,7 +4557,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4572,7 +4572,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4587,7 +4587,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4602,7 +4602,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4962,9 +4962,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4983,9 +4983,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -5004,9 +5004,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -5025,9 +5025,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -5046,9 +5046,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -5144,7 +5144,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5156,7 +5156,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5168,7 +5168,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5180,7 +5180,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5192,7 +5192,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5204,7 +5204,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5216,7 +5216,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5228,7 +5228,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5240,7 +5240,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5252,7 +5252,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5264,7 +5264,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5276,7 +5276,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5288,7 +5288,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5300,7 +5300,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5312,7 +5312,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5327,7 +5327,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5342,7 +5342,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5357,7 +5357,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5372,7 +5372,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5387,7 +5387,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6022,8 +6022,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6038,8 +6038,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6054,8 +6054,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6070,8 +6070,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6086,8 +6086,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6100,8 +6100,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6114,8 +6114,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6128,8 +6128,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6142,8 +6142,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6156,8 +6156,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6170,8 +6170,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6184,8 +6184,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6198,8 +6198,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6212,8 +6212,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6226,8 +6226,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6244,7 +6244,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6261,7 +6261,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6278,7 +6278,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6295,7 +6295,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6312,7 +6312,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6947,8 +6947,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6963,8 +6963,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6979,8 +6979,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6995,8 +6995,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7011,8 +7011,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7025,8 +7025,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7039,8 +7039,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7053,8 +7053,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7067,8 +7067,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7081,8 +7081,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7095,8 +7095,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7109,8 +7109,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7123,8 +7123,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7137,8 +7137,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7151,8 +7151,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7169,7 +7169,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7186,7 +7186,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7203,7 +7203,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7220,7 +7220,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7237,7 +7237,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -7872,8 +7872,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7888,8 +7888,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7904,8 +7904,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7920,8 +7920,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7936,8 +7936,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7950,8 +7950,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7964,8 +7964,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7978,8 +7978,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7992,8 +7992,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8006,8 +8006,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8020,8 +8020,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8034,8 +8034,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8048,8 +8048,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8062,8 +8062,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -8076,8 +8076,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -8094,7 +8094,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -8111,7 +8111,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -8128,7 +8128,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -8145,7 +8145,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -8162,7 +8162,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -8797,8 +8797,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -8813,8 +8813,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -8829,8 +8829,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -8845,8 +8845,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -8861,8 +8861,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -8875,8 +8875,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -8889,8 +8889,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -8903,8 +8903,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -8917,8 +8917,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8931,8 +8931,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8945,8 +8945,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8959,8 +8959,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8973,8 +8973,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8987,8 +8987,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -9001,8 +9001,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -9019,7 +9019,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -9036,7 +9036,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -9053,7 +9053,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -9070,7 +9070,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -9087,7 +9087,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8_1a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8_1a.ll
index b126100b749540..0c3ed9b0f1de0f 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8_1a.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8_1a.ll
@@ -659,7 +659,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -671,7 +671,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -683,7 +683,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -695,7 +695,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -707,7 +707,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -719,7 +719,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -731,7 +731,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -743,7 +743,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -755,7 +755,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -767,7 +767,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -779,7 +779,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -791,7 +791,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -803,7 +803,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -815,7 +815,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -827,7 +827,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1264,7 +1264,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1276,7 +1276,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1288,7 +1288,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1300,7 +1300,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1312,7 +1312,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1324,7 +1324,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1336,7 +1336,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1348,7 +1348,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1360,7 +1360,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1372,7 +1372,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1384,7 +1384,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1396,7 +1396,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1408,7 +1408,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1420,7 +1420,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1432,7 +1432,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1804,7 +1804,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1816,7 +1816,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1828,7 +1828,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1840,7 +1840,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1852,7 +1852,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1864,7 +1864,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1876,7 +1876,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1888,7 +1888,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1900,7 +1900,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1912,7 +1912,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1924,7 +1924,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1936,7 +1936,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1948,7 +1948,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1960,7 +1960,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1972,7 +1972,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1987,7 +1987,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2002,7 +2002,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2017,7 +2017,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2032,7 +2032,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2047,7 +2047,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -2600,7 +2600,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -2614,7 +2614,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -2628,7 +2628,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -2642,7 +2642,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -2656,7 +2656,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -2670,7 +2670,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -2684,7 +2684,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -2698,7 +2698,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -2712,7 +2712,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -2726,7 +2726,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -2740,7 +2740,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -2754,7 +2754,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -2768,7 +2768,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -2782,7 +2782,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -2796,7 +2796,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -2814,7 +2814,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2833,7 +2833,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2852,7 +2852,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2871,7 +2871,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -2890,7 +2890,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3174,7 +3174,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3186,7 +3186,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3198,7 +3198,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3210,7 +3210,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3222,7 +3222,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3234,7 +3234,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3246,7 +3246,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3258,7 +3258,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3270,7 +3270,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3282,7 +3282,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3294,7 +3294,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3306,7 +3306,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3318,7 +3318,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3330,7 +3330,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3342,7 +3342,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3357,7 +3357,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3372,7 +3372,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3387,7 +3387,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3402,7 +3402,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3417,7 +3417,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -3699,7 +3699,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -3711,7 +3711,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -3723,7 +3723,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -3735,7 +3735,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -3747,7 +3747,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -3759,7 +3759,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -3771,7 +3771,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -3783,7 +3783,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -3795,7 +3795,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -3807,7 +3807,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -3819,7 +3819,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -3831,7 +3831,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -3843,7 +3843,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -3855,7 +3855,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -3867,7 +3867,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -3882,7 +3882,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -3897,7 +3897,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -3912,7 +3912,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -3927,7 +3927,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -3942,7 +3942,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4237,8 +4237,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4253,8 +4253,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4269,8 +4269,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4285,8 +4285,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4301,8 +4301,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4315,8 +4315,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4329,8 +4329,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4343,8 +4343,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4357,8 +4357,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4371,8 +4371,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4385,8 +4385,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4399,8 +4399,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4413,8 +4413,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4427,8 +4427,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4441,8 +4441,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4459,7 +4459,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4476,7 +4476,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4493,7 +4493,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4510,7 +4510,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4527,7 +4527,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4822,8 +4822,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4838,8 +4838,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4854,8 +4854,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4870,8 +4870,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4886,8 +4886,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4900,8 +4900,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4914,8 +4914,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4928,8 +4928,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4942,8 +4942,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4956,8 +4956,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4970,8 +4970,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4984,8 +4984,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4998,8 +4998,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5012,8 +5012,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5026,8 +5026,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5044,7 +5044,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5061,7 +5061,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5078,7 +5078,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5095,7 +5095,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5112,7 +5112,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -5407,8 +5407,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5423,8 +5423,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5439,8 +5439,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5455,8 +5455,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5471,8 +5471,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5485,8 +5485,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5499,8 +5499,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5513,8 +5513,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5527,8 +5527,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5541,8 +5541,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5555,8 +5555,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5569,8 +5569,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5583,8 +5583,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5597,8 +5597,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5611,8 +5611,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5629,7 +5629,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5646,7 +5646,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5663,7 +5663,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5680,7 +5680,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5697,7 +5697,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -5992,8 +5992,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6008,8 +6008,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6024,8 +6024,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6040,8 +6040,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6056,8 +6056,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6070,8 +6070,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6084,8 +6084,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6098,8 +6098,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6112,8 +6112,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6126,8 +6126,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6140,8 +6140,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6154,8 +6154,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6168,8 +6168,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6182,8 +6182,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6196,8 +6196,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6214,7 +6214,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6231,7 +6231,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6248,7 +6248,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6265,7 +6265,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6282,7 +6282,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff  --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll
index d52b800dac34d7..a58e5a987bb4c9 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll
@@ -1129,7 +1129,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1141,7 +1141,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1153,7 +1153,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1165,7 +1165,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1177,7 +1177,7 @@ define dso_local i16 @atomicrmw_add_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i16_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1189,7 +1189,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_monotonic:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1201,7 +1201,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acquire:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1213,7 +1213,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_release:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1225,7 +1225,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_acq_rel:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -1237,7 +1237,7 @@ define dso_local i32 @atomicrmw_add_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i32_unaligned_seq_cst:
-; -O1:    add w8, w0, w19
+; -O1:    add w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -1249,7 +1249,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_monotonic:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -1261,7 +1261,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acquire:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -1273,7 +1273,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_release:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -1285,7 +1285,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_acq_rel:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -1297,7 +1297,7 @@ define dso_local i64 @atomicrmw_add_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_add_i64_unaligned_seq_cst:
-; -O1:    add x8, x0, x19
+; -O1:    add x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw add ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -1894,7 +1894,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -1906,7 +1906,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -1918,7 +1918,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -1930,7 +1930,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -1942,7 +1942,7 @@ define dso_local i16 @atomicrmw_sub_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i16_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -1954,7 +1954,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_monotonic:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -1966,7 +1966,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acquire:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -1978,7 +1978,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_release:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -1990,7 +1990,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_acq_rel:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2002,7 +2002,7 @@ define dso_local i32 @atomicrmw_sub_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i32_unaligned_seq_cst:
-; -O1:    sub w8, w0, w19
+; -O1:    sub w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2014,7 +2014,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_monotonic:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2026,7 +2026,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acquire:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2038,7 +2038,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_release:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2050,7 +2050,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_acq_rel:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2062,7 +2062,7 @@ define dso_local i64 @atomicrmw_sub_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_sub_i64_unaligned_seq_cst:
-; -O1:    sub x8, x0, x19
+; -O1:    sub x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw sub ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2487,9 +2487,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -2508,9 +2508,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -2529,9 +2529,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -2550,9 +2550,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -2571,9 +2571,9 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    and x9, x1, x3
-; -O1:    and x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    and x9, x0, x2
+; -O1:    and x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -2669,7 +2669,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -2681,7 +2681,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -2693,7 +2693,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -2705,7 +2705,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -2717,7 +2717,7 @@ define dso_local i16 @atomicrmw_and_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -2729,7 +2729,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -2741,7 +2741,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -2753,7 +2753,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -2765,7 +2765,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -2777,7 +2777,7 @@ define dso_local i32 @atomicrmw_and_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -2789,7 +2789,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -2801,7 +2801,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -2813,7 +2813,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -2825,7 +2825,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -2837,7 +2837,7 @@ define dso_local i64 @atomicrmw_and_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_and_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -2852,7 +2852,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -2867,7 +2867,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -2882,7 +2882,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -2897,7 +2897,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -2912,7 +2912,7 @@ define dso_local i128 @atomicrmw_and_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_and_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw and ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
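
All of the unaligned variants, by contrast, check only the value computation and a bl __atomic_compare_exchange: with align 1 the exclusive-pair expansion is unavailable, so the operation is lowered to a compare-exchange loop around the libatomic call, and the only visible difference is register allocation (w19/x19 becoming w20/x20, and x20 becoming x21 for one half of the i128 values). A minimal sketch of such a test body, assuming the same RUN lines as the rest of this file (illustrative only, name hypothetical):

  define dso_local i16 @example_rmw_and_unaligned(ptr %ptr, i16 %value) {
    %r = atomicrmw and ptr %ptr, i16 %value monotonic, align 1
    ret i16 %r
  }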
@@ -3315,9 +3315,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
@@ -3340,9 +3340,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
@@ -3365,9 +3365,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value release, align 16
     ret i128 %r
@@ -3390,9 +3390,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
@@ -3415,9 +3415,9 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
 ; -O1:    and x9, x1, x3
-; -O1:    mvn x9, x9
 ; -O1:    and x10, x0, x2
 ; -O1:    mvn x10, x10
+; -O1:    mvn x9, x9
 ; -O1:    stlxp w11, x9, x10, [x8]
     %r = atomicrmw nand ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
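
The nand checks differ from and only by an mvn after each and, since no single AArch64 instruction computes ~(a & b); the change above merely reorders the two independent mvns and leaves the stxp/stlxp operands untouched. Equivalent IR, following the pattern of the surrounding tests (illustrative only, name hypothetical):

  define dso_local i128 @example_rmw_nand(ptr %ptr, i128 %value) {
    %r = atomicrmw nand ptr %ptr, i128 %value monotonic, align 16
    ret i128 %r
  }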
@@ -3525,7 +3525,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value monotonic, align 1
@@ -3539,7 +3539,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acquire, align 1
@@ -3553,7 +3553,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value release, align 1
@@ -3567,7 +3567,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value acq_rel, align 1
@@ -3581,7 +3581,7 @@ define dso_local i16 @atomicrmw_nand_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i16_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i16 %value seq_cst, align 1
@@ -3595,7 +3595,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_monotonic:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value monotonic, align 1
@@ -3609,7 +3609,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acquire:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acquire, align 1
@@ -3623,7 +3623,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_release:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value release, align 1
@@ -3637,7 +3637,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_acq_rel:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value acq_rel, align 1
@@ -3651,7 +3651,7 @@ define dso_local i32 @atomicrmw_nand_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i32_unaligned_seq_cst:
-; -O1:    and w8, w0, w19
+; -O1:    and w8, w0, w20
 ; -O1:    mvn w8, w8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i32 %value seq_cst, align 1
@@ -3665,7 +3665,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_monotonic:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value monotonic, align 1
@@ -3679,7 +3679,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acquire:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acquire, align 1
@@ -3693,7 +3693,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_release:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value release, align 1
@@ -3707,7 +3707,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_acq_rel:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value acq_rel, align 1
@@ -3721,7 +3721,7 @@ define dso_local i64 @atomicrmw_nand_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_nand_i64_unaligned_seq_cst:
-; -O1:    and x8, x0, x19
+; -O1:    and x8, x0, x20
 ; -O1:    mvn x8, x8
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw nand ptr %ptr, i64 %value seq_cst, align 1
@@ -3739,7 +3739,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3758,7 +3758,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3777,7 +3777,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3796,7 +3796,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -3815,7 +3815,7 @@ define dso_local i128 @atomicrmw_nand_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_nand_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    and x8, x1, x19
-; -O1:    and x9, x0, x20
+; -O1:    and x9, x0, x21
 ; -O1:    mvn x8, x8
 ; -O1:    mvn x9, x9
 ; -O1:    bl __atomic_compare_exchange
@@ -4177,9 +4177,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4198,9 +4198,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -4219,9 +4219,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -4240,9 +4240,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -4261,9 +4261,9 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    orr x9, x1, x3
-; -O1:    orr x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    orr x9, x0, x2
+; -O1:    orr x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -4359,7 +4359,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_monotonic(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -4371,7 +4371,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acquire(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -4383,7 +4383,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_release(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -4395,7 +4395,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_acq_rel(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -4407,7 +4407,7 @@ define dso_local i16 @atomicrmw_or_i16_unaligned_seq_cst(ptr %ptr, i16 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i16_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -4419,7 +4419,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_monotonic(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_monotonic:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -4431,7 +4431,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acquire(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acquire:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -4443,7 +4443,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_release(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_release:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -4455,7 +4455,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_acq_rel(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_acq_rel:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -4467,7 +4467,7 @@ define dso_local i32 @atomicrmw_or_i32_unaligned_seq_cst(ptr %ptr, i32 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i32_unaligned_seq_cst:
-; -O1:    orr w8, w0, w19
+; -O1:    orr w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -4479,7 +4479,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_monotonic(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_monotonic:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -4491,7 +4491,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acquire(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acquire:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -4503,7 +4503,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_release(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_release:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -4515,7 +4515,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_acq_rel(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_acq_rel:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -4527,7 +4527,7 @@ define dso_local i64 @atomicrmw_or_i64_unaligned_seq_cst(ptr %ptr, i64 %value) {
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_or_i64_unaligned_seq_cst:
-; -O1:    orr x8, x0, x19
+; -O1:    orr x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -4542,7 +4542,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_monotonic(ptr %ptr, i128 %val
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -4557,7 +4557,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acquire(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -4572,7 +4572,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_release(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -4587,7 +4587,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_acq_rel(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -4602,7 +4602,7 @@ define dso_local i128 @atomicrmw_or_i128_unaligned_seq_cst(ptr %ptr, i128 %value
 ; -O1-LABEL: atomicrmw_or_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    orr x8, x1, x19
-; -O1:    orr x9, x0, x20
+; -O1:    orr x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw or ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -4962,9 +4962,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %valu
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 16
     ret i128 %r
 }
@@ -4983,9 +4983,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 16
     ret i128 %r
 }
@@ -5004,9 +5004,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1:    ldxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 16
     ret i128 %r
 }
@@ -5025,9 +5025,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 16
     ret i128 %r
 }
@@ -5046,9 +5046,9 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value)
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1:    ldaxp x1, x0, [x8]
-; -O1:    eor x9, x1, x3
-; -O1:    eor x10, x0, x2
-; -O1:    stlxp w11, x9, x10, [x8]
+; -O1:    eor x9, x0, x2
+; -O1:    eor x10, x1, x3
+; -O1:    stlxp w11, x10, x9, [x8]
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 16
     ret i128 %r
 }
@@ -5144,7 +5144,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -5156,7 +5156,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -5168,7 +5168,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_release(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -5180,7 +5180,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -5192,7 +5192,7 @@ define dso_local i16 @atomicrmw_xor_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i16_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -5204,7 +5204,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_monotonic:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -5216,7 +5216,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acquire:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -5228,7 +5228,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_release:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -5240,7 +5240,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_acq_rel:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -5252,7 +5252,7 @@ define dso_local i32 @atomicrmw_xor_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i32_unaligned_seq_cst:
-; -O1:    eor w8, w0, w19
+; -O1:    eor w8, w0, w20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -5264,7 +5264,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_monotonic:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -5276,7 +5276,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acquire:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -5288,7 +5288,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_release:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -5300,7 +5300,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_acq_rel:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -5312,7 +5312,7 @@ define dso_local i64 @atomicrmw_xor_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_xor_i64_unaligned_seq_cst:
-; -O1:    eor x8, x0, x19
+; -O1:    eor x8, x0, x20
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -5327,7 +5327,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_monotonic:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -5342,7 +5342,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acquire:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -5357,7 +5357,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_release:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -5372,7 +5372,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_acq_rel:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -5387,7 +5387,7 @@ define dso_local i128 @atomicrmw_xor_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1-LABEL: atomicrmw_xor_i128_unaligned_seq_cst:
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    eor x8, x1, x19
-; -O1:    eor x9, x0, x20
+; -O1:    eor x9, x0, x21
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw xor ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -6022,8 +6022,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6038,8 +6038,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6054,8 +6054,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6070,8 +6070,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -6086,8 +6086,8 @@ define dso_local i16 @atomicrmw_max_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_max_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -6100,8 +6100,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -6114,8 +6114,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -6128,8 +6128,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -6142,8 +6142,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -6156,8 +6156,8 @@ define dso_local i32 @atomicrmw_max_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, gt
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -6170,8 +6170,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -6184,8 +6184,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -6198,8 +6198,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -6212,8 +6212,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -6226,8 +6226,8 @@ define dso_local i64 @atomicrmw_max_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_max_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, gt
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, gt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -6244,7 +6244,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -6261,7 +6261,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -6278,7 +6278,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -6295,7 +6295,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -6312,7 +6312,7 @@ define dso_local i128 @atomicrmw_max_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lt
-; -O1:    csel x9, x0, x20, lt
+; -O1:    csel x9, x0, x21, lt
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw max ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
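
max (and min below) has no ALU instruction that combines the old and new values, so the expansion makes the comparison explicit: for i16 the loaded value is sign-extended with sxth and compared against a sign-extending cmp operand, then csel picks the value handed to __atomic_compare_exchange (gt for max, le for min); for i128 each half gets its own csel. As before, only the w19/w20 and x20/x21 renumbering changes. A sketch of the corresponding IR (illustrative only, name hypothetical):

  define dso_local i32 @example_rmw_max(ptr %ptr, i32 %value) {
    %r = atomicrmw max ptr %ptr, i32 %value monotonic, align 1
    ret i32 %r
  }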
@@ -6947,8 +6947,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_monotonic(ptr %ptr, i16 %value
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_monotonic:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -6963,8 +6963,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acquire:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -6979,8 +6979,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_release:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -6995,8 +6995,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_acq_rel:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7011,8 +7011,8 @@ define dso_local i16 @atomicrmw_min_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_min_i16_unaligned_seq_cst:
 ; -O1:    sxth w8, w0
-; -O1:    cmp w8, w19, sxth
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w8, w20, sxth
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7025,8 +7025,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_monotonic(ptr %ptr, i32 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7039,8 +7039,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7053,8 +7053,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7067,8 +7067,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -7081,8 +7081,8 @@ define dso_local i32 @atomicrmw_min_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, le
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -7095,8 +7095,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_monotonic(ptr %ptr, i64 %value
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -7109,8 +7109,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -7123,8 +7123,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -7137,8 +7137,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -7151,8 +7151,8 @@ define dso_local i64 @atomicrmw_min_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_min_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, le
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, le
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -7169,7 +7169,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_monotonic(ptr %ptr, i128 %va
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -7186,7 +7186,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acquire(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -7203,7 +7203,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_release(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -7220,7 +7220,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_acq_rel(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -7237,7 +7237,7 @@ define dso_local i128 @atomicrmw_min_i128_unaligned_seq_cst(ptr %ptr, i128 %valu
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, ge
-; -O1:    csel x9, x0, x20, ge
+; -O1:    csel x9, x0, x21, ge
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw min ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -7872,8 +7872,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -7888,8 +7888,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -7904,8 +7904,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -7920,8 +7920,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -7936,8 +7936,8 @@ define dso_local i16 @atomicrmw_umax_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umax_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -7950,8 +7950,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -7964,8 +7964,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -7978,8 +7978,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -7992,8 +7992,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8006,8 +8006,8 @@ define dso_local i32 @atomicrmw_umax_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, hi
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8020,8 +8020,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8034,8 +8034,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8048,8 +8048,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8062,8 +8062,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -8076,8 +8076,8 @@ define dso_local i64 @atomicrmw_umax_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umax_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, hi
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, hi
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -8094,7 +8094,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -8111,7 +8111,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -8128,7 +8128,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -8145,7 +8145,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -8162,7 +8162,7 @@ define dso_local i128 @atomicrmw_umax_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, lo
-; -O1:    csel x9, x0, x20, lo
+; -O1:    csel x9, x0, x21, lo
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umax ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r
@@ -8797,8 +8797,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_monotonic(ptr %ptr, i16 %valu
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_monotonic:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value monotonic, align 1
     ret i16 %r
@@ -8813,8 +8813,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acquire(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acquire:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acquire, align 1
     ret i16 %r
@@ -8829,8 +8829,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_release(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_release:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value release, align 1
     ret i16 %r
@@ -8845,8 +8845,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_acq_rel(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_acq_rel:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value acq_rel, align 1
     ret i16 %r
@@ -8861,8 +8861,8 @@ define dso_local i16 @atomicrmw_umin_i16_unaligned_seq_cst(ptr %ptr, i16 %value)
 ;
 ; -O1-LABEL: atomicrmw_umin_i16_unaligned_seq_cst:
 ; -O1:    and w8, w0, #0xffff
-; -O1:    cmp w8, w19, uxth
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w8, w20, uxth
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i16 %value seq_cst, align 1
     ret i16 %r
@@ -8875,8 +8875,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_monotonic(ptr %ptr, i32 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_monotonic:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value monotonic, align 1
     ret i32 %r
@@ -8889,8 +8889,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acquire(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acquire:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acquire, align 1
     ret i32 %r
@@ -8903,8 +8903,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_release(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_release:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value release, align 1
     ret i32 %r
@@ -8917,8 +8917,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_acq_rel(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_acq_rel:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value acq_rel, align 1
     ret i32 %r
@@ -8931,8 +8931,8 @@ define dso_local i32 @atomicrmw_umin_i32_unaligned_seq_cst(ptr %ptr, i32 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i32_unaligned_seq_cst:
-; -O1:    cmp w0, w19
-; -O1:    csel w8, w0, w19, ls
+; -O1:    cmp w0, w20
+; -O1:    csel w8, w0, w20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i32 %value seq_cst, align 1
     ret i32 %r
@@ -8945,8 +8945,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_monotonic(ptr %ptr, i64 %valu
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_monotonic:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value monotonic, align 1
     ret i64 %r
@@ -8959,8 +8959,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acquire(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acquire:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acquire, align 1
     ret i64 %r
@@ -8973,8 +8973,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_release(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_release:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value release, align 1
     ret i64 %r
@@ -8987,8 +8987,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_acq_rel(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_acq_rel:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value acq_rel, align 1
     ret i64 %r
@@ -9001,8 +9001,8 @@ define dso_local i64 @atomicrmw_umin_i64_unaligned_seq_cst(ptr %ptr, i64 %value)
 ; -O0:    bl __atomic_compare_exchange
 ;
 ; -O1-LABEL: atomicrmw_umin_i64_unaligned_seq_cst:
-; -O1:    cmp x0, x19
-; -O1:    csel x8, x0, x19, ls
+; -O1:    cmp x0, x20
+; -O1:    csel x8, x0, x20, ls
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i64 %value seq_cst, align 1
     ret i64 %r
@@ -9019,7 +9019,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_monotonic(ptr %ptr, i128 %v
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value monotonic, align 1
     ret i128 %r
@@ -9036,7 +9036,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acquire(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acquire, align 1
     ret i128 %r
@@ -9053,7 +9053,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_release(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value release, align 1
     ret i128 %r
@@ -9070,7 +9070,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_acq_rel(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value acq_rel, align 1
     ret i128 %r
@@ -9087,7 +9087,7 @@ define dso_local i128 @atomicrmw_umin_i128_unaligned_seq_cst(ptr %ptr, i128 %val
 ; -O1:    ldp x0, x1, [x0]
 ; -O1:    cmp x19, x1
 ; -O1:    csel x8, x1, x19, hs
-; -O1:    csel x9, x0, x20, hs
+; -O1:    csel x9, x0, x21, hs
 ; -O1:    bl __atomic_compare_exchange
     %r = atomicrmw umin ptr %ptr, i128 %value seq_cst, align 1
     ret i128 %r

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/aapcs_vararg_frame.ll b/llvm/test/CodeGen/AArch64/GlobalISel/aapcs_vararg_frame.ll
index 9df0f1b82c1b5e..7892e892a24126 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/aapcs_vararg_frame.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/aapcs_vararg_frame.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s --global-isel=0 -mtriple=aarch64-linux-gnu -mattr=+fp-armv8 | FileCheck %s
 ; RUN: llc < %s --global-isel=1 -mtriple=aarch64-linux-gnu -mattr=+fp-armv8 | FileCheck %s --check-prefix=GISEL
 
@@ -5,9 +6,9 @@ define void @va(i32 %count, half %f, ...) nounwind {
 ; CHECK-LABEL: va:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #176
-; CHECK-NEXT:    stp x4, x5, [sp, #144]
 ; CHECK-NEXT:    stp x2, x3, [sp, #128]
 ; CHECK-NEXT:    str x1, [sp, #120]
+; CHECK-NEXT:    stp x4, x5, [sp, #144]
 ; CHECK-NEXT:    stp x6, x7, [sp, #160]
 ; CHECK-NEXT:    stp q1, q2, [sp]
 ; CHECK-NEXT:    stp q3, q4, [sp, #32]

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
index 44ef5574130679..1ec020b7896753 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -502,7 +502,7 @@ define i64 @fetch_and_nand_64(ptr %p) #0 {
 define i32 @fetch_and_or(ptr %p) #0 {
 ; CHECK-NOLSE-O1-LABEL: fetch_and_or:
 ; CHECK-NOLSE-O1:       ; %bb.0:
-; CHECK-NOLSE-O1-NEXT:    mov w9, #5
+; CHECK-NOLSE-O1-NEXT:    mov w9, #5 ; =0x5
 ; CHECK-NOLSE-O1-NEXT:  LBB8_1: ; %atomicrmw.start
 ; CHECK-NOLSE-O1-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NOLSE-O1-NEXT:    ldaxr w8, [x0]
@@ -525,7 +525,7 @@ define i32 @fetch_and_or(ptr %p) #0 {
 ; CHECK-NOLSE-O0-NEXT:    ; Child Loop BB8_2 Depth 2
 ; CHECK-NOLSE-O0-NEXT:    ldr w8, [sp, #28] ; 4-byte Folded Reload
 ; CHECK-NOLSE-O0-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
-; CHECK-NOLSE-O0-NEXT:    mov w9, #5
+; CHECK-NOLSE-O0-NEXT:    mov w9, #5 ; =0x5
 ; CHECK-NOLSE-O0-NEXT:    orr w12, w8, w9
 ; CHECK-NOLSE-O0-NEXT:  LBB8_2: ; %atomicrmw.start
 ; CHECK-NOLSE-O0-NEXT:    ; Parent Loop BB8_1 Depth=1
@@ -552,13 +552,13 @@ define i32 @fetch_and_or(ptr %p) #0 {
 ;
 ; CHECK-LSE-O1-LABEL: fetch_and_or:
 ; CHECK-LSE-O1:       ; %bb.0:
-; CHECK-LSE-O1-NEXT:    mov w8, #5
+; CHECK-LSE-O1-NEXT:    mov w8, #5 ; =0x5
 ; CHECK-LSE-O1-NEXT:    ldsetal w8, w0, [x0]
 ; CHECK-LSE-O1-NEXT:    ret
 ;
 ; CHECK-LSE-O0-LABEL: fetch_and_or:
 ; CHECK-LSE-O0:       ; %bb.0:
-; CHECK-LSE-O0-NEXT:    mov w8, #5
+; CHECK-LSE-O0-NEXT:    mov w8, #5 ; =0x5
 ; CHECK-LSE-O0-NEXT:    ldsetal w8, w0, [x0]
 ; CHECK-LSE-O0-NEXT:    ret
   %val = atomicrmw or ptr %p, i32 5 seq_cst
@@ -616,13 +616,13 @@ define i64 @fetch_and_or_64(ptr %p) #0 {
 ;
 ; CHECK-LSE-O1-LABEL: fetch_and_or_64:
 ; CHECK-LSE-O1:       ; %bb.0:
-; CHECK-LSE-O1-NEXT:    mov w8, #7
+; CHECK-LSE-O1-NEXT:    mov w8, #7 ; =0x7
 ; CHECK-LSE-O1-NEXT:    ldset x8, x0, [x0]
 ; CHECK-LSE-O1-NEXT:    ret
 ;
 ; CHECK-LSE-O0-LABEL: fetch_and_or_64:
 ; CHECK-LSE-O0:       ; %bb.0:
-; CHECK-LSE-O0-NEXT:    mov w8, #7
+; CHECK-LSE-O0-NEXT:    mov w8, #7 ; =0x7
 ; CHECK-LSE-O0-NEXT:    ; kill: def $x8 killed $w8
 ; CHECK-LSE-O0-NEXT:    ldset x8, x0, [x0]
 ; CHECK-LSE-O0-NEXT:    ret
@@ -709,14 +709,14 @@ define i32 @atomic_load(ptr %p) #0 {
 define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-LABEL: atomic_load_relaxed_8:
 ; CHECK-NOLSE-O1:       ; %bb.0:
-; CHECK-NOLSE-O1-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
-; CHECK-NOLSE-O1-NEXT:    ldrb w9, [x0, #4095]
-; CHECK-NOLSE-O1-NEXT:    ldrb w10, [x0, w1, sxtw]
-; CHECK-NOLSE-O1-NEXT:    ldurb w11, [x0, #-256]
-; CHECK-NOLSE-O1-NEXT:    ldrb w8, [x8]
-; CHECK-NOLSE-O1-NEXT:    add w9, w9, w11
-; CHECK-NOLSE-O1-NEXT:    add w9, w9, w10
-; CHECK-NOLSE-O1-NEXT:    add w0, w9, w8
+; CHECK-NOLSE-O1-NEXT:    ldrb w8, [x0, #4095]
+; CHECK-NOLSE-O1-NEXT:    ldrb w9, [x0, w1, sxtw]
+; CHECK-NOLSE-O1-NEXT:    add x11, x0, #291, lsl #12 ; =1191936
+; CHECK-NOLSE-O1-NEXT:    ldurb w10, [x0, #-256]
+; CHECK-NOLSE-O1-NEXT:    add w8, w8, w9
+; CHECK-NOLSE-O1-NEXT:    ldrb w9, [x11]
+; CHECK-NOLSE-O1-NEXT:    add w8, w8, w10
+; CHECK-NOLSE-O1-NEXT:    add w0, w8, w9
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_8:
@@ -779,14 +779,14 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) #0 {
 define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-LABEL: atomic_load_relaxed_16:
 ; CHECK-NOLSE-O1:       ; %bb.0:
-; CHECK-NOLSE-O1-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
-; CHECK-NOLSE-O1-NEXT:    ldrh w9, [x0, #8190]
-; CHECK-NOLSE-O1-NEXT:    ldrh w10, [x0, w1, sxtw #1]
-; CHECK-NOLSE-O1-NEXT:    ldurh w11, [x0, #-256]
-; CHECK-NOLSE-O1-NEXT:    ldrh w8, [x8]
-; CHECK-NOLSE-O1-NEXT:    add w9, w9, w11
-; CHECK-NOLSE-O1-NEXT:    add w9, w9, w10
-; CHECK-NOLSE-O1-NEXT:    add w0, w9, w8
+; CHECK-NOLSE-O1-NEXT:    ldrh w8, [x0, #8190]
+; CHECK-NOLSE-O1-NEXT:    ldrh w9, [x0, w1, sxtw #1]
+; CHECK-NOLSE-O1-NEXT:    add x11, x0, #291, lsl #12 ; =1191936
+; CHECK-NOLSE-O1-NEXT:    ldurh w10, [x0, #-256]
+; CHECK-NOLSE-O1-NEXT:    add w8, w8, w9
+; CHECK-NOLSE-O1-NEXT:    ldrh w9, [x11]
+; CHECK-NOLSE-O1-NEXT:    add w8, w8, w10
+; CHECK-NOLSE-O1-NEXT:    add w0, w8, w9
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_16:
@@ -849,14 +849,14 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) #0 {
 define i32 @atomic_load_relaxed_32(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-LABEL: atomic_load_relaxed_32:
 ; CHECK-NOLSE-O1:       ; %bb.0:
-; CHECK-NOLSE-O1-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
-; CHECK-NOLSE-O1-NEXT:    ldr w9, [x0, #16380]
-; CHECK-NOLSE-O1-NEXT:    ldr w10, [x0, w1, sxtw #2]
-; CHECK-NOLSE-O1-NEXT:    ldur w11, [x0, #-256]
-; CHECK-NOLSE-O1-NEXT:    ldr w8, [x8]
-; CHECK-NOLSE-O1-NEXT:    add w9, w9, w11
-; CHECK-NOLSE-O1-NEXT:    add w9, w9, w10
-; CHECK-NOLSE-O1-NEXT:    add w0, w9, w8
+; CHECK-NOLSE-O1-NEXT:    ldr w8, [x0, #16380]
+; CHECK-NOLSE-O1-NEXT:    ldr w9, [x0, w1, sxtw #2]
+; CHECK-NOLSE-O1-NEXT:    add x11, x0, #291, lsl #12 ; =1191936
+; CHECK-NOLSE-O1-NEXT:    ldur w10, [x0, #-256]
+; CHECK-NOLSE-O1-NEXT:    add w8, w8, w9
+; CHECK-NOLSE-O1-NEXT:    ldr w9, [x11]
+; CHECK-NOLSE-O1-NEXT:    add w8, w8, w10
+; CHECK-NOLSE-O1-NEXT:    add w0, w8, w9
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_32:
@@ -915,14 +915,14 @@ define i32 @atomic_load_relaxed_32(ptr %p, i32 %off32) #0 {
 define i64 @atomic_load_relaxed_64(ptr %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-LABEL: atomic_load_relaxed_64:
 ; CHECK-NOLSE-O1:       ; %bb.0:
-; CHECK-NOLSE-O1-NEXT:    add x8, x0, #291, lsl #12 ; =1191936
-; CHECK-NOLSE-O1-NEXT:    ldr x9, [x0, #32760]
-; CHECK-NOLSE-O1-NEXT:    ldr x10, [x0, w1, sxtw #3]
-; CHECK-NOLSE-O1-NEXT:    ldur x11, [x0, #-256]
-; CHECK-NOLSE-O1-NEXT:    ldr x8, [x8]
-; CHECK-NOLSE-O1-NEXT:    add x9, x9, x11
-; CHECK-NOLSE-O1-NEXT:    add x9, x9, x10
-; CHECK-NOLSE-O1-NEXT:    add x0, x9, x8
+; CHECK-NOLSE-O1-NEXT:    ldr x8, [x0, #32760]
+; CHECK-NOLSE-O1-NEXT:    ldr x9, [x0, w1, sxtw #3]
+; CHECK-NOLSE-O1-NEXT:    add x11, x0, #291, lsl #12 ; =1191936
+; CHECK-NOLSE-O1-NEXT:    ldur x10, [x0, #-256]
+; CHECK-NOLSE-O1-NEXT:    add x8, x8, x9
+; CHECK-NOLSE-O1-NEXT:    ldr x9, [x11]
+; CHECK-NOLSE-O1-NEXT:    add x8, x8, x10
+; CHECK-NOLSE-O1-NEXT:    add x0, x8, x9
 ; CHECK-NOLSE-O1-NEXT:    ret
 ;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_64:
@@ -982,19 +982,19 @@ define i64 @atomic_load_relaxed_64(ptr %p, i32 %off32) #0 {
 define void @atomc_store(ptr %p) #0 {
 ; CHECK-NOLSE-LABEL: atomc_store:
 ; CHECK-NOLSE:       ; %bb.0:
-; CHECK-NOLSE-NEXT:    mov w8, #4
+; CHECK-NOLSE-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-NOLSE-NEXT:    stlr w8, [x0]
 ; CHECK-NOLSE-NEXT:    ret
 ;
 ; CHECK-LSE-O1-LABEL: atomc_store:
 ; CHECK-LSE-O1:       ; %bb.0:
-; CHECK-LSE-O1-NEXT:    mov w8, #4
+; CHECK-LSE-O1-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-LSE-O1-NEXT:    stlr w8, [x0]
 ; CHECK-LSE-O1-NEXT:    ret
 ;
 ; CHECK-LSE-O0-LABEL: atomc_store:
 ; CHECK-LSE-O0:       ; %bb.0:
-; CHECK-LSE-O0-NEXT:    mov w8, #4
+; CHECK-LSE-O0-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-LSE-O0-NEXT:    stlr w8, [x0]
 ; CHECK-LSE-O0-NEXT:    ret
    store atomic i32 4, ptr %p seq_cst, align 4
@@ -2743,7 +2743,7 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) {
 ; CHECK-NOLSE-O1-NEXT:    stxrb w10, w2, [x8]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB47_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.3:
-; CHECK-NOLSE-O1-NEXT:    mov w1, #1
+; CHECK-NOLSE-O1-NEXT:    mov w1, #1 ; =0x1
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT:    ret
 ; CHECK-NOLSE-O1-NEXT:  LBB47_4: ; %cmpxchg.nostore
@@ -2810,7 +2810,7 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) {
 ; CHECK-NOLSE-O1-NEXT:    stxrh w10, w2, [x8]
 ; CHECK-NOLSE-O1-NEXT:    cbnz w10, LBB48_1
 ; CHECK-NOLSE-O1-NEXT:  ; %bb.3:
-; CHECK-NOLSE-O1-NEXT:    mov w1, #1
+; CHECK-NOLSE-O1-NEXT:    mov w1, #1 ; =0x1
 ; CHECK-NOLSE-O1-NEXT:    ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT:    ret
 ; CHECK-NOLSE-O1-NEXT:  LBB48_4: ; %cmpxchg.nostore

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
index 21ac5bc697df20..54e7d5ee9c13b7 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll
@@ -384,14 +384,14 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) {
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $w1, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $x8 = ADDXri renamable $x0, 291, 12
-  ; CHECK-NEXT:   renamable $w9 = LDRBBui renamable $x0, 4095, pcsections !0 :: (load monotonic (s8) from %ir.ptr_unsigned)
-  ; CHECK-NEXT:   renamable $w10 = LDRBBroW renamable $x0, killed renamable $w1, 1, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_regoff)
-  ; CHECK-NEXT:   renamable $w11 = LDURBBi killed renamable $x0, -256, pcsections !0 :: (load monotonic (s8) from %ir.ptr_unscaled)
-  ; CHECK-NEXT:   renamable $w8 = LDRBBui killed renamable $x8, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_random)
-  ; CHECK-NEXT:   $w9 = ADDWrs killed renamable $w9, killed renamable $w11, 0, pcsections !0
-  ; CHECK-NEXT:   $w9 = ADDWrs killed renamable $w9, killed renamable $w10, 0, pcsections !0
-  ; CHECK-NEXT:   $w0 = ADDWrs killed renamable $w9, killed renamable $w8, 0, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = LDRBBui renamable $x0, 4095, pcsections !0 :: (load monotonic (s8) from %ir.ptr_unsigned)
+  ; CHECK-NEXT:   renamable $w9 = LDRBBroW renamable $x0, killed renamable $w1, 1, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_regoff)
+  ; CHECK-NEXT:   renamable $w10 = LDURBBi renamable $x0, -256, pcsections !0 :: (load monotonic (s8) from %ir.ptr_unscaled)
+  ; CHECK-NEXT:   renamable $x11 = ADDXri killed renamable $x0, 291, 12
+  ; CHECK-NEXT:   $w8 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0
+  ; CHECK-NEXT:   renamable $w9 = LDRBBui killed renamable $x11, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_random)
+  ; CHECK-NEXT:   $w8 = ADDWrs killed renamable $w8, killed renamable $w10, 0, pcsections !0
+  ; CHECK-NEXT:   $w0 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %ptr_unsigned = getelementptr i8, ptr %p, i32 4095
   %val_unsigned = load atomic i8, ptr %ptr_unsigned monotonic, align 1, !pcsections !0
@@ -416,14 +416,14 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) {
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $w1, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $x8 = ADDXri renamable $x0, 291, 12
-  ; CHECK-NEXT:   renamable $w9 = LDRHHui renamable $x0, 4095, pcsections !0 :: (load monotonic (s16) from %ir.ptr_unsigned)
-  ; CHECK-NEXT:   renamable $w10 = LDRHHroW renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s16) from %ir.ptr_regoff)
-  ; CHECK-NEXT:   renamable $w11 = LDURHHi killed renamable $x0, -256, pcsections !0 :: (load monotonic (s16) from %ir.ptr_unscaled)
-  ; CHECK-NEXT:   renamable $w8 = LDRHHui killed renamable $x8, 0, pcsections !0 :: (load unordered (s16) from %ir.ptr_random)
-  ; CHECK-NEXT:   $w9 = ADDWrs killed renamable $w9, killed renamable $w11, 0, pcsections !0
-  ; CHECK-NEXT:   $w9 = ADDWrs killed renamable $w9, killed renamable $w10, 0, pcsections !0
-  ; CHECK-NEXT:   $w0 = ADDWrs killed renamable $w9, killed renamable $w8, 0, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = LDRHHui renamable $x0, 4095, pcsections !0 :: (load monotonic (s16) from %ir.ptr_unsigned)
+  ; CHECK-NEXT:   renamable $w9 = LDRHHroW renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s16) from %ir.ptr_regoff)
+  ; CHECK-NEXT:   renamable $w10 = LDURHHi renamable $x0, -256, pcsections !0 :: (load monotonic (s16) from %ir.ptr_unscaled)
+  ; CHECK-NEXT:   renamable $x11 = ADDXri killed renamable $x0, 291, 12
+  ; CHECK-NEXT:   $w8 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0
+  ; CHECK-NEXT:   renamable $w9 = LDRHHui killed renamable $x11, 0, pcsections !0 :: (load unordered (s16) from %ir.ptr_random)
+  ; CHECK-NEXT:   $w8 = ADDWrs killed renamable $w8, killed renamable $w10, 0, pcsections !0
+  ; CHECK-NEXT:   $w0 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %ptr_unsigned = getelementptr i16, ptr %p, i32 4095
   %val_unsigned = load atomic i16, ptr %ptr_unsigned monotonic, align 2, !pcsections !0
@@ -448,14 +448,14 @@ define i32 @atomic_load_relaxed_32(ptr %p, i32 %off32) {
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $w1, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $x8 = ADDXri renamable $x0, 291, 12
-  ; CHECK-NEXT:   renamable $w9 = LDRWui renamable $x0, 4095, pcsections !0 :: (load monotonic (s32) from %ir.ptr_unsigned)
-  ; CHECK-NEXT:   renamable $w10 = LDRWroW renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s32) from %ir.ptr_regoff)
-  ; CHECK-NEXT:   renamable $w11 = LDURWi killed renamable $x0, -256, pcsections !0 :: (load monotonic (s32) from %ir.ptr_unscaled)
-  ; CHECK-NEXT:   renamable $w8 = LDRWui killed renamable $x8, 0, pcsections !0 :: (load unordered (s32) from %ir.ptr_random)
-  ; CHECK-NEXT:   $w9 = ADDWrs killed renamable $w9, killed renamable $w11, 0, pcsections !0
-  ; CHECK-NEXT:   $w9 = ADDWrs killed renamable $w9, killed renamable $w10, 0, pcsections !0
-  ; CHECK-NEXT:   $w0 = ADDWrs killed renamable $w9, killed renamable $w8, 0, pcsections !0
+  ; CHECK-NEXT:   renamable $w8 = LDRWui renamable $x0, 4095, pcsections !0 :: (load monotonic (s32) from %ir.ptr_unsigned)
+  ; CHECK-NEXT:   renamable $w9 = LDRWroW renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s32) from %ir.ptr_regoff)
+  ; CHECK-NEXT:   renamable $w10 = LDURWi renamable $x0, -256, pcsections !0 :: (load monotonic (s32) from %ir.ptr_unscaled)
+  ; CHECK-NEXT:   renamable $x11 = ADDXri killed renamable $x0, 291, 12
+  ; CHECK-NEXT:   $w8 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0
+  ; CHECK-NEXT:   renamable $w9 = LDRWui killed renamable $x11, 0, pcsections !0 :: (load unordered (s32) from %ir.ptr_random)
+  ; CHECK-NEXT:   $w8 = ADDWrs killed renamable $w8, killed renamable $w10, 0, pcsections !0
+  ; CHECK-NEXT:   $w0 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0
   ; CHECK-NEXT:   RET undef $lr, implicit $w0
   %ptr_unsigned = getelementptr i32, ptr %p, i32 4095
   %val_unsigned = load atomic i32, ptr %ptr_unsigned monotonic, align 4, !pcsections !0
@@ -480,14 +480,14 @@ define i64 @atomic_load_relaxed_64(ptr %p, i32 %off32) {
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $w1, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $x8 = ADDXri renamable $x0, 291, 12
-  ; CHECK-NEXT:   renamable $x9 = LDRXui renamable $x0, 4095, pcsections !0 :: (load monotonic (s64) from %ir.ptr_unsigned)
-  ; CHECK-NEXT:   renamable $x10 = LDRXroW renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s64) from %ir.ptr_regoff)
-  ; CHECK-NEXT:   renamable $x11 = LDURXi killed renamable $x0, -256, pcsections !0 :: (load monotonic (s64) from %ir.ptr_unscaled)
-  ; CHECK-NEXT:   renamable $x8 = LDRXui killed renamable $x8, 0, pcsections !0 :: (load unordered (s64) from %ir.ptr_random)
-  ; CHECK-NEXT:   $x9 = ADDXrs killed renamable $x9, killed renamable $x11, 0, pcsections !0
-  ; CHECK-NEXT:   $x9 = ADDXrs killed renamable $x9, killed renamable $x10, 0, pcsections !0
-  ; CHECK-NEXT:   $x0 = ADDXrs killed renamable $x9, killed renamable $x8, 0, pcsections !0
+  ; CHECK-NEXT:   renamable $x8 = LDRXui renamable $x0, 4095, pcsections !0 :: (load monotonic (s64) from %ir.ptr_unsigned)
+  ; CHECK-NEXT:   renamable $x9 = LDRXroW renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (load unordered (s64) from %ir.ptr_regoff)
+  ; CHECK-NEXT:   renamable $x10 = LDURXi renamable $x0, -256, pcsections !0 :: (load monotonic (s64) from %ir.ptr_unscaled)
+  ; CHECK-NEXT:   renamable $x11 = ADDXri killed renamable $x0, 291, 12
+  ; CHECK-NEXT:   $x8 = ADDXrs killed renamable $x8, killed renamable $x9, 0, pcsections !0
+  ; CHECK-NEXT:   renamable $x9 = LDRXui killed renamable $x11, 0, pcsections !0 :: (load unordered (s64) from %ir.ptr_random)
+  ; CHECK-NEXT:   $x8 = ADDXrs killed renamable $x8, killed renamable $x10, 0, pcsections !0
+  ; CHECK-NEXT:   $x0 = ADDXrs killed renamable $x8, killed renamable $x9, 0, pcsections !0
   ; CHECK-NEXT:   RET undef $lr, implicit $x0
   %ptr_unsigned = getelementptr i64, ptr %p, i32 4095
   %val_unsigned = load atomic i64, ptr %ptr_unsigned monotonic, align 8, !pcsections !0
@@ -525,10 +525,10 @@ define void @atomic_store_relaxed_8(ptr %p, i32 %off32, i8 %val) {
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $w1, $w2, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $x8 = ADDXri renamable $x0, 291, 12
   ; CHECK-NEXT:   STRBBui renamable $w2, renamable $x0, 4095, pcsections !0 :: (store monotonic (s8) into %ir.ptr_unsigned)
   ; CHECK-NEXT:   STRBBroW renamable $w2, renamable $x0, killed renamable $w1, 1, 0, pcsections !0 :: (store unordered (s8) into %ir.ptr_regoff)
-  ; CHECK-NEXT:   STURBBi renamable $w2, killed renamable $x0, -256, pcsections !0 :: (store monotonic (s8) into %ir.ptr_unscaled)
+  ; CHECK-NEXT:   STURBBi renamable $w2, renamable $x0, -256, pcsections !0 :: (store monotonic (s8) into %ir.ptr_unscaled)
+  ; CHECK-NEXT:   renamable $x8 = ADDXri killed renamable $x0, 291, 12
   ; CHECK-NEXT:   STRBBui killed renamable $w2, killed renamable $x8, 0, pcsections !0 :: (store unordered (s8) into %ir.ptr_random)
   ; CHECK-NEXT:   RET undef $lr
   %ptr_unsigned = getelementptr i8, ptr %p, i32 4095
@@ -551,10 +551,10 @@ define void @atomic_store_relaxed_16(ptr %p, i32 %off32, i16 %val) {
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $w1, $w2, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $x8 = ADDXri renamable $x0, 291, 12
   ; CHECK-NEXT:   STRHHui renamable $w2, renamable $x0, 4095, pcsections !0 :: (store monotonic (s16) into %ir.ptr_unsigned)
   ; CHECK-NEXT:   STRHHroW renamable $w2, renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (store unordered (s16) into %ir.ptr_regoff)
-  ; CHECK-NEXT:   STURHHi renamable $w2, killed renamable $x0, -256, pcsections !0 :: (store monotonic (s16) into %ir.ptr_unscaled)
+  ; CHECK-NEXT:   STURHHi renamable $w2, renamable $x0, -256, pcsections !0 :: (store monotonic (s16) into %ir.ptr_unscaled)
+  ; CHECK-NEXT:   renamable $x8 = ADDXri killed renamable $x0, 291, 12
   ; CHECK-NEXT:   STRHHui killed renamable $w2, killed renamable $x8, 0, pcsections !0 :: (store unordered (s16) into %ir.ptr_random)
   ; CHECK-NEXT:   RET undef $lr
   %ptr_unsigned = getelementptr i16, ptr %p, i32 4095
@@ -577,10 +577,10 @@ define void @atomic_store_relaxed_32(ptr %p, i32 %off32, i32 %val) {
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $w1, $w2, $x0
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $x8 = ADDXri renamable $x0, 291, 12
   ; CHECK-NEXT:   STRWui renamable $w2, renamable $x0, 4095, pcsections !0 :: (store monotonic (s32) into %ir.ptr_unsigned)
   ; CHECK-NEXT:   STRWroW renamable $w2, renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (store unordered (s32) into %ir.ptr_regoff)
-  ; CHECK-NEXT:   STURWi renamable $w2, killed renamable $x0, -256, pcsections !0 :: (store monotonic (s32) into %ir.ptr_unscaled)
+  ; CHECK-NEXT:   STURWi renamable $w2, renamable $x0, -256, pcsections !0 :: (store monotonic (s32) into %ir.ptr_unscaled)
+  ; CHECK-NEXT:   renamable $x8 = ADDXri killed renamable $x0, 291, 12
   ; CHECK-NEXT:   STRWui killed renamable $w2, killed renamable $x8, 0, pcsections !0 :: (store unordered (s32) into %ir.ptr_random)
   ; CHECK-NEXT:   RET undef $lr
   %ptr_unsigned = getelementptr i32, ptr %p, i32 4095
@@ -603,10 +603,10 @@ define void @atomic_store_relaxed_64(ptr %p, i32 %off32, i64 %val) {
   ; CHECK: bb.0 (%ir-block.0):
   ; CHECK-NEXT:   liveins: $w1, $x0, $x2
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $x8 = ADDXri renamable $x0, 291, 12
   ; CHECK-NEXT:   STRXui renamable $x2, renamable $x0, 4095, pcsections !0 :: (store monotonic (s64) into %ir.ptr_unsigned)
   ; CHECK-NEXT:   STRXroW renamable $x2, renamable $x0, killed renamable $w1, 1, 1, pcsections !0 :: (store unordered (s64) into %ir.ptr_regoff)
-  ; CHECK-NEXT:   STURXi renamable $x2, killed renamable $x0, -256, pcsections !0 :: (store monotonic (s64) into %ir.ptr_unscaled)
+  ; CHECK-NEXT:   STURXi renamable $x2, renamable $x0, -256, pcsections !0 :: (store monotonic (s64) into %ir.ptr_unscaled)
+  ; CHECK-NEXT:   renamable $x8 = ADDXri killed renamable $x0, 291, 12
   ; CHECK-NEXT:   STRXui killed renamable $x2, killed renamable $x8, 0, pcsections !0 :: (store unordered (s64) into %ir.ptr_random)
   ; CHECK-NEXT:   RET undef $lr
   %ptr_unsigned = getelementptr i64, ptr %p, i32 4095
@@ -633,6 +633,7 @@ define i32 @load_zext(ptr %p8, ptr %p16) {
   ; CHECK-NOLSE-NEXT:   renamable $w9 = LDRHHui killed renamable $x1, 0, pcsections !0 :: (load unordered (s16) from %ir.p16)
   ; CHECK-NOLSE-NEXT:   renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 0, pcsections !0
   ; CHECK-NOLSE-NEXT:   RET undef $lr, implicit $w0
+  ;
   ; CHECK-LDAPR-LABEL: name: load_zext
   ; CHECK-LDAPR: bb.0 (%ir-block.0):
   ; CHECK-LDAPR-NEXT:   liveins: $x0, $x1
@@ -659,6 +660,7 @@ define { i32, i64 } @load_acq(ptr %p32, ptr %p64) {
   ; CHECK-NOLSE-NEXT:   renamable $w0 = LDARW killed renamable $x0, pcsections !0 :: (load seq_cst (s32) from %ir.p32)
   ; CHECK-NOLSE-NEXT:   renamable $x1 = LDARX killed renamable $x1, pcsections !0 :: (load acquire (s64) from %ir.p64)
   ; CHECK-NOLSE-NEXT:   RET undef $lr, implicit $w0, implicit $x1
+  ;
   ; CHECK-LDAPR-LABEL: name: load_acq
   ; CHECK-LDAPR: bb.0 (%ir-block.0):
   ; CHECK-LDAPR-NEXT:   liveins: $x0, $x1
@@ -685,6 +687,7 @@ define i32 @load_sext(ptr %p8, ptr %p16) {
   ; CHECK-NOLSE-NEXT:   renamable $w9 = SBFMWri killed renamable $w9, 0, 15
   ; CHECK-NOLSE-NEXT:   renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 32, pcsections !0
   ; CHECK-NOLSE-NEXT:   RET undef $lr, implicit $w0
+  ;
   ; CHECK-LDAPR-LABEL: name: load_sext
   ; CHECK-LDAPR: bb.0 (%ir-block.0):
   ; CHECK-LDAPR-NEXT:   liveins: $x0, $x1

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll b/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
index 2d8a675c737c32..a6133bd898681c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
@@ -70,17 +70,17 @@ define i32 @test_musttail_variadic_spill(i32 %arg0, ...) {
 ; CHECK-NEXT:    bl _puts
 ; CHECK-NEXT:    ldp q1, q0, [sp, #96] ; 32-byte Folded Reload
 ; CHECK-NEXT:    mov w0, w19
+; CHECK-NEXT:    ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
 ; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
 ; CHECK-NEXT:    mov x2, x21
+; CHECK-NEXT:    ldp q7, q6, [sp] ; 32-byte Folded Reload
 ; CHECK-NEXT:    mov x3, x22
 ; CHECK-NEXT:    mov x4, x23
 ; CHECK-NEXT:    mov x5, x24
 ; CHECK-NEXT:    mov x6, x25
 ; CHECK-NEXT:    mov x7, x26
 ; CHECK-NEXT:    mov x8, x27
-; CHECK-NEXT:    ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
-; CHECK-NEXT:    ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
-; CHECK-NEXT:    ldp q7, q6, [sp] ; 32-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #208] ; 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #192] ; 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #176] ; 16-byte Folded Reload
@@ -122,8 +122,9 @@ define void @f_thunk(ptr %this, ...) {
 ; CHECK-NEXT:    .cfi_offset w26, -80
 ; CHECK-NEXT:    .cfi_offset w27, -88
 ; CHECK-NEXT:    .cfi_offset w28, -96
-; CHECK-NEXT:    add x9, sp, #128
-; CHECK-NEXT:    add x10, sp, #256
+; CHECK-NEXT:    mov x27, x8
+; CHECK-NEXT:    add x8, sp, #128
+; CHECK-NEXT:    add x9, sp, #256
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    mov x20, x1
 ; CHECK-NEXT:    mov x21, x2
@@ -133,16 +134,18 @@ define void @f_thunk(ptr %this, ...) {
 ; CHECK-NEXT:    mov x25, x6
 ; CHECK-NEXT:    mov x26, x7
 ; CHECK-NEXT:    stp q7, q6, [sp] ; 32-byte Folded Spill
-; CHECK-NEXT:    mov x27, x8
 ; CHECK-NEXT:    stp q5, q4, [sp, #32] ; 32-byte Folded Spill
 ; CHECK-NEXT:    stp q3, q2, [sp, #64] ; 32-byte Folded Spill
 ; CHECK-NEXT:    stp q1, q0, [sp, #96] ; 32-byte Folded Spill
-; CHECK-NEXT:    str x10, [x9]
+; CHECK-NEXT:    str x9, [x8]
 ; CHECK-NEXT:    bl _get_f
-; CHECK-NEXT:    ldp q1, q0, [sp, #96] ; 32-byte Folded Reload
 ; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    ldp q1, q0, [sp, #96] ; 32-byte Folded Reload
+; CHECK-NEXT:    ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
 ; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
 ; CHECK-NEXT:    mov x1, x20
+; CHECK-NEXT:    ldp q7, q6, [sp] ; 32-byte Folded Reload
 ; CHECK-NEXT:    mov x2, x21
 ; CHECK-NEXT:    mov x3, x22
 ; CHECK-NEXT:    mov x4, x23
@@ -150,9 +153,6 @@ define void @f_thunk(ptr %this, ...) {
 ; CHECK-NEXT:    mov x6, x25
 ; CHECK-NEXT:    mov x7, x26
 ; CHECK-NEXT:    mov x8, x27
-; CHECK-NEXT:    ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
-; CHECK-NEXT:    ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
-; CHECK-NEXT:    ldp q7, q6, [sp] ; 32-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #240] ; 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #224] ; 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #208] ; 16-byte Folded Reload
@@ -193,7 +193,7 @@ define void @h_thunk(ptr %this, ...) {
 ; CHECK-NEXT:  Lloh2:
 ; CHECK-NEXT:    adrp x10, _g@GOTPAGE
 ; CHECK-NEXT:    ldr x9, [x0, #16]
-; CHECK-NEXT:    mov w11, #42
+; CHECK-NEXT:    mov w11, #42 ; =0x2a
 ; CHECK-NEXT:  Lloh3:
 ; CHECK-NEXT:    ldr x10, [x10, _g@GOTPAGEOFF]
 ; CHECK-NEXT:  Lloh4:

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index 395a4a8b87de28..56d851c52bb67a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -35,10 +35,10 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; SDAG-LABEL: combine_vec_udiv_nonuniform:
 ; SDAG:       // %bb.0:
 ; SDAG-NEXT:    adrp x8, .LCPI1_0
-; SDAG-NEXT:    adrp x9, .LCPI1_1
 ; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
+; SDAG-NEXT:    adrp x8, .LCPI1_1
+; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI1_1]
 ; SDAG-NEXT:    adrp x8, .LCPI1_2
-; SDAG-NEXT:    ldr q2, [x9, :lo12:.LCPI1_1]
 ; SDAG-NEXT:    ushl v1.8h, v0.8h, v1.8h
 ; SDAG-NEXT:    umull2 v3.4s, v1.8h, v2.8h
 ; SDAG-NEXT:    umull v1.4s, v1.4h, v2.4h
@@ -48,39 +48,39 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; SDAG-NEXT:    sub v0.8h, v0.8h, v1.8h
 ; SDAG-NEXT:    umull2 v3.4s, v0.8h, v2.8h
 ; SDAG-NEXT:    umull v0.4s, v0.4h, v2.4h
-; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI1_3]
 ; SDAG-NEXT:    uzp2 v0.8h, v0.8h, v3.8h
 ; SDAG-NEXT:    add v0.8h, v0.8h, v1.8h
-; SDAG-NEXT:    ushl v0.8h, v0.8h, v2.8h
+; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI1_3]
+; SDAG-NEXT:    ushl v0.8h, v0.8h, v1.8h
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: combine_vec_udiv_nonuniform:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    adrp x8, .LCPI1_4
-; GISEL-NEXT:    adrp x9, .LCPI1_5
+; GISEL-NEXT:    adrp x9, .LCPI1_0
 ; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI1_4]
 ; GISEL-NEXT:    adrp x8, .LCPI1_3
-; GISEL-NEXT:    neg v1.8h, v1.8h
 ; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI1_3]
 ; GISEL-NEXT:    adrp x8, .LCPI1_2
+; GISEL-NEXT:    neg v1.8h, v1.8h
 ; GISEL-NEXT:    ushl v1.8h, v0.8h, v1.8h
 ; GISEL-NEXT:    umull2 v3.4s, v1.8h, v2.8h
 ; GISEL-NEXT:    umull v1.4s, v1.4h, v2.4h
-; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI1_2]
-; GISEL-NEXT:    adrp x8, .LCPI1_1
 ; GISEL-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
-; GISEL-NEXT:    sub v3.8h, v0.8h, v1.8h
-; GISEL-NEXT:    umull2 v4.4s, v3.8h, v2.8h
-; GISEL-NEXT:    umull v2.4s, v3.4h, v2.4h
+; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI1_2]
+; GISEL-NEXT:    adrp x8, .LCPI1_1
+; GISEL-NEXT:    sub v2.8h, v0.8h, v1.8h
+; GISEL-NEXT:    umull2 v4.4s, v2.8h, v3.8h
+; GISEL-NEXT:    umull v2.4s, v2.4h, v3.4h
 ; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI1_1]
-; GISEL-NEXT:    adrp x8, .LCPI1_0
-; GISEL-NEXT:    neg v3.8h, v3.8h
+; GISEL-NEXT:    adrp x8, .LCPI1_5
 ; GISEL-NEXT:    uzp2 v2.8h, v2.8h, v4.8h
-; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI1_5]
-; GISEL-NEXT:    ldr q5, [x8, :lo12:.LCPI1_0]
+; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI1_0]
 ; GISEL-NEXT:    add v1.8h, v2.8h, v1.8h
-; GISEL-NEXT:    cmeq v2.8h, v4.8h, v5.8h
-; GISEL-NEXT:    ushl v1.8h, v1.8h, v3.8h
+; GISEL-NEXT:    neg v2.8h, v3.8h
+; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI1_5]
+; GISEL-NEXT:    ushl v1.8h, v1.8h, v2.8h
+; GISEL-NEXT:    cmeq v2.8h, v3.8h, v4.8h
 ; GISEL-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT:    ret
   %1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
@@ -91,38 +91,38 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
 ; SDAG-LABEL: combine_vec_udiv_nonuniform2:
 ; SDAG:       // %bb.0:
 ; SDAG-NEXT:    adrp x8, .LCPI2_0
-; SDAG-NEXT:    adrp x9, .LCPI2_1
 ; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
-; SDAG-NEXT:    adrp x8, .LCPI2_2
-; SDAG-NEXT:    ldr q2, [x9, :lo12:.LCPI2_1]
+; SDAG-NEXT:    adrp x8, .LCPI2_1
 ; SDAG-NEXT:    ushl v0.8h, v0.8h, v1.8h
-; SDAG-NEXT:    umull2 v1.4s, v0.8h, v2.8h
-; SDAG-NEXT:    umull v0.4s, v0.4h, v2.4h
-; SDAG-NEXT:    uzp2 v0.8h, v0.8h, v1.8h
+; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI2_1]
+; SDAG-NEXT:    adrp x8, .LCPI2_2
+; SDAG-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; SDAG-NEXT:    umull v0.4s, v0.4h, v1.4h
 ; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI2_2]
+; SDAG-NEXT:    uzp2 v0.8h, v0.8h, v2.8h
 ; SDAG-NEXT:    ushl v0.8h, v0.8h, v1.8h
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: combine_vec_udiv_nonuniform2:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    adrp x8, .LCPI2_3
-; GISEL-NEXT:    adrp x9, .LCPI2_1
+; GISEL-NEXT:    adrp x9, .LCPI2_0
 ; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI2_3]
 ; GISEL-NEXT:    adrp x8, .LCPI2_2
-; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI2_1]
-; GISEL-NEXT:    neg v1.8h, v1.8h
+; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI2_0]
 ; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI2_2]
-; GISEL-NEXT:    adrp x8, .LCPI2_4
+; GISEL-NEXT:    adrp x8, .LCPI2_1
+; GISEL-NEXT:    neg v1.8h, v1.8h
 ; GISEL-NEXT:    ushl v1.8h, v0.8h, v1.8h
-; GISEL-NEXT:    neg v4.8h, v4.8h
 ; GISEL-NEXT:    umull2 v3.4s, v1.8h, v2.8h
 ; GISEL-NEXT:    umull v1.4s, v1.4h, v2.4h
-; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI2_4]
-; GISEL-NEXT:    adrp x8, .LCPI2_0
+; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI2_1]
+; GISEL-NEXT:    adrp x8, .LCPI2_4
+; GISEL-NEXT:    neg v2.8h, v2.8h
 ; GISEL-NEXT:    uzp2 v1.8h, v1.8h, v3.8h
-; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI2_0]
-; GISEL-NEXT:    cmeq v2.8h, v2.8h, v3.8h
-; GISEL-NEXT:    ushl v1.8h, v1.8h, v4.8h
+; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI2_4]
+; GISEL-NEXT:    ushl v1.8h, v1.8h, v2.8h
+; GISEL-NEXT:    cmeq v2.8h, v3.8h, v4.8h
 ; GISEL-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT:    ret
   %1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
@@ -147,21 +147,21 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
 ; GISEL-LABEL: combine_vec_udiv_nonuniform3:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    adrp x8, .LCPI3_2
-; GISEL-NEXT:    adrp x9, .LCPI3_1
+; GISEL-NEXT:    adrp x9, .LCPI3_0
 ; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI3_2]
-; GISEL-NEXT:    adrp x8, .LCPI3_3
-; GISEL-NEXT:    ldr q3, [x9, :lo12:.LCPI3_1]
+; GISEL-NEXT:    adrp x8, .LCPI3_1
+; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI3_0]
 ; GISEL-NEXT:    umull2 v2.4s, v0.8h, v1.8h
 ; GISEL-NEXT:    umull v1.4s, v0.4h, v1.4h
-; GISEL-NEXT:    neg v3.8h, v3.8h
 ; GISEL-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI3_3]
-; GISEL-NEXT:    adrp x8, .LCPI3_0
-; GISEL-NEXT:    sub v4.8h, v0.8h, v1.8h
-; GISEL-NEXT:    ldr q5, [x8, :lo12:.LCPI3_0]
-; GISEL-NEXT:    usra v1.8h, v4.8h, #1
-; GISEL-NEXT:    cmeq v2.8h, v2.8h, v5.8h
-; GISEL-NEXT:    ushl v1.8h, v1.8h, v3.8h
+; GISEL-NEXT:    sub v2.8h, v0.8h, v1.8h
+; GISEL-NEXT:    usra v1.8h, v2.8h, #1
+; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI3_1]
+; GISEL-NEXT:    adrp x8, .LCPI3_3
+; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI3_3]
+; GISEL-NEXT:    neg v2.8h, v2.8h
+; GISEL-NEXT:    ushl v1.8h, v1.8h, v2.8h
+; GISEL-NEXT:    cmeq v2.8h, v3.8h, v4.8h
 ; GISEL-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT:    ret
   %1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
@@ -174,8 +174,8 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SDAG-NEXT:    movi v1.16b, #171
 ; SDAG-NEXT:    adrp x8, .LCPI4_0
 ; SDAG-NEXT:    adrp x9, .LCPI4_1
-; SDAG-NEXT:    umull2 v2.8h, v0.16b, v1.16b
 ; SDAG-NEXT:    ldr q3, [x9, :lo12:.LCPI4_1]
+; SDAG-NEXT:    umull2 v2.8h, v0.16b, v1.16b
 ; SDAG-NEXT:    umull v1.8h, v0.8b, v1.8b
 ; SDAG-NEXT:    and v0.16b, v0.16b, v3.16b
 ; SDAG-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
@@ -188,19 +188,19 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; GISEL-LABEL: combine_vec_udiv_nonuniform4:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    adrp x8, .LCPI4_2
-; GISEL-NEXT:    adrp x9, .LCPI4_1
+; GISEL-NEXT:    adrp x9, .LCPI4_0
 ; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI4_2]
+; GISEL-NEXT:    adrp x8, .LCPI4_1
+; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI4_0]
+; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI4_1]
 ; GISEL-NEXT:    adrp x8, .LCPI4_3
-; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI4_1]
 ; GISEL-NEXT:    umull2 v2.8h, v0.16b, v1.16b
-; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI4_3]
 ; GISEL-NEXT:    umull v1.8h, v0.8b, v1.8b
-; GISEL-NEXT:    adrp x8, .LCPI4_0
-; GISEL-NEXT:    neg v4.16b, v4.16b
 ; GISEL-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
-; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
-; GISEL-NEXT:    cmeq v2.16b, v3.16b, v2.16b
-; GISEL-NEXT:    ushl v1.16b, v1.16b, v4.16b
+; GISEL-NEXT:    neg v2.16b, v3.16b
+; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI4_3]
+; GISEL-NEXT:    ushl v1.16b, v1.16b, v2.16b
+; GISEL-NEXT:    cmeq v2.16b, v3.16b, v4.16b
 ; GISEL-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT:    ret
   %div = udiv <16 x i8> %x, <i8 -64, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
@@ -211,52 +211,52 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SDAG-LABEL: pr38477:
 ; SDAG:       // %bb.0:
 ; SDAG-NEXT:    adrp x8, .LCPI5_0
-; SDAG-NEXT:    adrp x9, .LCPI5_3
+; SDAG-NEXT:    adrp x9, .LCPI5_4
 ; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI5_0]
 ; SDAG-NEXT:    adrp x8, .LCPI5_1
+; SDAG-NEXT:    ldr q3, [x8, :lo12:.LCPI5_1]
+; SDAG-NEXT:    adrp x8, .LCPI5_2
 ; SDAG-NEXT:    umull2 v2.4s, v0.8h, v1.8h
 ; SDAG-NEXT:    umull v1.4s, v0.4h, v1.4h
 ; SDAG-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
-; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI5_1]
-; SDAG-NEXT:    adrp x8, .LCPI5_2
-; SDAG-NEXT:    sub v3.8h, v0.8h, v1.8h
-; SDAG-NEXT:    umull2 v4.4s, v3.8h, v2.8h
-; SDAG-NEXT:    umull v2.4s, v3.4h, v2.4h
-; SDAG-NEXT:    ldr q3, [x8, :lo12:.LCPI5_2]
-; SDAG-NEXT:    adrp x8, .LCPI5_4
+; SDAG-NEXT:    sub v2.8h, v0.8h, v1.8h
+; SDAG-NEXT:    umull2 v4.4s, v2.8h, v3.8h
+; SDAG-NEXT:    umull v2.4s, v2.4h, v3.4h
+; SDAG-NEXT:    ldr q3, [x9, :lo12:.LCPI5_4]
+; SDAG-NEXT:    and v0.16b, v0.16b, v3.16b
 ; SDAG-NEXT:    uzp2 v2.8h, v2.8h, v4.8h
-; SDAG-NEXT:    ldr q4, [x9, :lo12:.LCPI5_3]
 ; SDAG-NEXT:    add v1.8h, v2.8h, v1.8h
-; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI5_4]
-; SDAG-NEXT:    ushl v1.8h, v1.8h, v3.8h
-; SDAG-NEXT:    and v0.16b, v0.16b, v2.16b
-; SDAG-NEXT:    and v1.16b, v1.16b, v4.16b
+; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI5_2]
+; SDAG-NEXT:    adrp x8, .LCPI5_3
+; SDAG-NEXT:    ushl v1.8h, v1.8h, v2.8h
+; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI5_3]
+; SDAG-NEXT:    and v1.16b, v1.16b, v2.16b
 ; SDAG-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: pr38477:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    adrp x8, .LCPI5_3
-; GISEL-NEXT:    adrp x9, .LCPI5_4
+; GISEL-NEXT:    adrp x9, .LCPI5_0
 ; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI5_3]
 ; GISEL-NEXT:    adrp x8, .LCPI5_2
+; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI5_2]
+; GISEL-NEXT:    adrp x8, .LCPI5_1
 ; GISEL-NEXT:    umull2 v2.4s, v0.8h, v1.8h
 ; GISEL-NEXT:    umull v1.4s, v0.4h, v1.4h
 ; GISEL-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
-; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI5_2]
-; GISEL-NEXT:    adrp x8, .LCPI5_1
-; GISEL-NEXT:    sub v3.8h, v0.8h, v1.8h
-; GISEL-NEXT:    umull2 v4.4s, v3.8h, v2.8h
-; GISEL-NEXT:    umull v2.4s, v3.4h, v2.4h
+; GISEL-NEXT:    sub v2.8h, v0.8h, v1.8h
+; GISEL-NEXT:    umull2 v4.4s, v2.8h, v3.8h
+; GISEL-NEXT:    umull v2.4s, v2.4h, v3.4h
 ; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI5_1]
-; GISEL-NEXT:    adrp x8, .LCPI5_0
-; GISEL-NEXT:    neg v3.8h, v3.8h
+; GISEL-NEXT:    adrp x8, .LCPI5_4
 ; GISEL-NEXT:    uzp2 v2.8h, v2.8h, v4.8h
-; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI5_4]
-; GISEL-NEXT:    ldr q5, [x8, :lo12:.LCPI5_0]
+; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI5_0]
 ; GISEL-NEXT:    add v1.8h, v2.8h, v1.8h
-; GISEL-NEXT:    cmeq v2.8h, v4.8h, v5.8h
-; GISEL-NEXT:    ushl v1.8h, v1.8h, v3.8h
+; GISEL-NEXT:    neg v2.8h, v3.8h
+; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI5_4]
+; GISEL-NEXT:    ushl v1.8h, v1.8h, v2.8h
+; GISEL-NEXT:    cmeq v2.8h, v3.8h, v4.8h
 ; GISEL-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT:    ret
   %1 = udiv <8 x i16> %a0, <i16 1, i16 119, i16 73, i16 -111, i16 -3, i16 118, i16 32, i16 31>

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-bitfield-insert.ll b/llvm/test/CodeGen/AArch64/GlobalISel/select-bitfield-insert.ll
index b8e2364485966c..e1df07c93ebf1f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-bitfield-insert.ll
@@ -145,8 +145,8 @@ define i64 @extra_use2(i64 %in1, i64 %in2, ptr %p) {
 ; SDAG:       ; %bb.0: ; %bb
 ; SDAG-NEXT:    and x8, x1, #0x1
 ; SDAG-NEXT:    bfi x1, x0, #1, #63
-; SDAG-NEXT:    mov x0, x1
 ; SDAG-NEXT:    str x8, [x2]
+; SDAG-NEXT:    mov x0, x1
 ; SDAG-NEXT:    ret
 bb:
   %tmp3 = shl i64 %in1, 1

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
index 16bf85af9c17b3..ae26c363ef56bf 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/stacksave-stackrestore.ll
@@ -17,8 +17,8 @@ define void @test_scoped_alloca(i64 %n) {
 ; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    add x9, x0, #15
 ; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
 ; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
 ; CHECK-NEXT:    sub x0, x8, x9
 ; CHECK-NEXT:    mov sp, x0
 ; CHECK-NEXT:    bl use_addr

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll
index 7a9eee6f30b672..23886d8bc4a7ba 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll
@@ -4,8 +4,8 @@
 define void @test_simple_2xs8(ptr %ptr) {
 ; CHECK-LABEL: test_simple_2xs8:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
-; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    mov w8, #4 ; =0x4
+; CHECK-NEXT:    mov w9, #5 ; =0x5
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    strb w9, [x0, #1]
 ; CHECK-NEXT:    ret
@@ -18,7 +18,7 @@ define void @test_simple_2xs8(ptr %ptr) {
 define void @test_simple_2xs16(ptr %ptr) {
 ; CHECK-LABEL: test_simple_2xs16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-NEXT:    movk w8, #5, lsl #16
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
@@ -31,7 +31,7 @@ define void @test_simple_2xs16(ptr %ptr) {
 define void @test_simple_4xs16(ptr %ptr) {
 ; CHECK-LABEL: test_simple_4xs16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov x8, #4
+; CHECK-NEXT:    mov x8, #4 ; =0x4
 ; CHECK-NEXT:    movk x8, #5, lsl #16
 ; CHECK-NEXT:    movk x8, #9, lsl #32
 ; CHECK-NEXT:    movk x8, #14, lsl #48
@@ -50,7 +50,7 @@ define void @test_simple_4xs16(ptr %ptr) {
 define void @test_simple_2xs32(ptr %ptr) {
 ; CHECK-LABEL: test_simple_2xs32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov x8, #4
+; CHECK-NEXT:    mov x8, #4 ; =0x4
 ; CHECK-NEXT:    movk x8, #5, lsl #32
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
@@ -63,8 +63,8 @@ define void @test_simple_2xs32(ptr %ptr) {
 define void @test_simple_2xs64_illegal(ptr %ptr) {
 ; CHECK-LABEL: test_simple_2xs64_illegal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
-; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    mov w8, #4 ; =0x4
+; CHECK-NEXT:    mov w9, #5 ; =0x5
 ; CHECK-NEXT:    stp x8, x9, [x0]
 ; CHECK-NEXT:    ret
   store i64 4, ptr %ptr
@@ -77,14 +77,14 @@ define void @test_simple_2xs64_illegal(ptr %ptr) {
 define void @test_simple_vector(ptr %ptr) {
 ; CHECK-LABEL: test_simple_vector:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
-; CHECK-NEXT:    mov w9, #7
-; CHECK-NEXT:    mov w10, #5
-; CHECK-NEXT:    mov w11, #8
+; CHECK-NEXT:    mov w8, #4 ; =0x4
+; CHECK-NEXT:    mov w9, #7 ; =0x7
 ; CHECK-NEXT:    strh w8, [x0]
+; CHECK-NEXT:    mov w8, #5 ; =0x5
 ; CHECK-NEXT:    strh w9, [x0, #2]
-; CHECK-NEXT:    strh w10, [x0, #4]
-; CHECK-NEXT:    strh w11, [x0, #6]
+; CHECK-NEXT:    mov w9, #8 ; =0x8
+; CHECK-NEXT:    strh w8, [x0, #4]
+; CHECK-NEXT:    strh w9, [x0, #6]
 ; CHECK-NEXT:    ret
   store <2 x i16> <i16 4, i16 7>, ptr %ptr
   %addr2 = getelementptr <2 x i16>, ptr %ptr, i64 1
@@ -95,10 +95,10 @@ define void @test_simple_vector(ptr %ptr) {
 define i32 @test_unknown_alias(ptr %ptr, ptr %aliasptr) {
 ; CHECK-LABEL: test_unknown_alias:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w9, #4
+; CHECK-NEXT:    mov w9, #4 ; =0x4
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    str w9, [x0]
-; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    mov w9, #5 ; =0x5
 ; CHECK-NEXT:    ldr w0, [x1]
 ; CHECK-NEXT:    str w9, [x8, #4]
 ; CHECK-NEXT:    ret
@@ -112,12 +112,12 @@ define i32 @test_unknown_alias(ptr %ptr, ptr %aliasptr) {
 define void @test_2x_2xs32(ptr %ptr, ptr %ptr2) {
 ; CHECK-LABEL: test_2x_2xs32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov x10, #9
-; CHECK-NEXT:    mov w8, #4
-; CHECK-NEXT:    mov w9, #5
-; CHECK-NEXT:    movk x10, #17, lsl #32
+; CHECK-NEXT:    mov w8, #4 ; =0x4
+; CHECK-NEXT:    mov w9, #5 ; =0x5
 ; CHECK-NEXT:    stp w8, w9, [x0]
-; CHECK-NEXT:    str x10, [x1]
+; CHECK-NEXT:    mov x8, #9 ; =0x9
+; CHECK-NEXT:    movk x8, #17, lsl #32
+; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    ret
   store i32 4, ptr %ptr
   %addr2 = getelementptr i32, ptr %ptr, i64 1
@@ -170,14 +170,14 @@ define void @test_simple_var_2xs32(ptr %ptr, i32 %v1, i32 %v2) {
 define void @test_alias_4xs16(ptr %ptr, ptr %ptr2) {
 ; CHECK-LABEL: test_alias_4xs16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
-; CHECK-NEXT:    mov w9, #9
+; CHECK-NEXT:    mov w8, #4 ; =0x4
+; CHECK-NEXT:    mov w9, #9 ; =0x9
 ; CHECK-NEXT:    movk w8, #5, lsl #16
-; CHECK-NEXT:    mov w10, #14
 ; CHECK-NEXT:    strh w9, [x0, #4]
 ; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    mov w8, #14 ; =0xe
 ; CHECK-NEXT:    strh wzr, [x1]
-; CHECK-NEXT:    strh w10, [x0, #6]
+; CHECK-NEXT:    strh w8, [x0, #6]
 ; CHECK-NEXT:    ret
   store i16 4, ptr %ptr
   %addr2 = getelementptr i16, ptr %ptr, i64 1
@@ -194,13 +194,13 @@ define void @test_alias_4xs16(ptr %ptr, ptr %ptr2) {
 define void @test_alias2_4xs16(ptr %ptr, ptr %ptr2, ptr %ptr3) {
 ; CHECK-LABEL: test_alias2_4xs16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
-; CHECK-NEXT:    mov w9, #5
-; CHECK-NEXT:    movk w9, #9, lsl #16
+; CHECK-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-NEXT:    strh w8, [x0]
-; CHECK-NEXT:    mov w8, #14
+; CHECK-NEXT:    mov w8, #5 ; =0x5
+; CHECK-NEXT:    movk w8, #9, lsl #16
 ; CHECK-NEXT:    strh wzr, [x2]
-; CHECK-NEXT:    stur w9, [x0, #2]
+; CHECK-NEXT:    stur w8, [x0, #2]
+; CHECK-NEXT:    mov w8, #14 ; =0xe
 ; CHECK-NEXT:    strh wzr, [x1]
 ; CHECK-NEXT:    strh w8, [x0, #6]
 ; CHECK-NEXT:    ret
@@ -220,17 +220,17 @@ define void @test_alias2_4xs16(ptr %ptr, ptr %ptr2, ptr %ptr3) {
 define void @test_alias3_4xs16(ptr %ptr, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
 ; CHECK-LABEL: test_alias3_4xs16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
-; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-NEXT:    strh w8, [x0]
-; CHECK-NEXT:    mov w8, #9
+; CHECK-NEXT:    mov w8, #5 ; =0x5
 ; CHECK-NEXT:    strh wzr, [x2]
-; CHECK-NEXT:    strh w9, [x0, #2]
-; CHECK-NEXT:    mov w9, #14
+; CHECK-NEXT:    strh w8, [x0, #2]
+; CHECK-NEXT:    mov w8, #9 ; =0x9
 ; CHECK-NEXT:    strh wzr, [x3]
 ; CHECK-NEXT:    strh w8, [x0, #4]
+; CHECK-NEXT:    mov w8, #14 ; =0xe
 ; CHECK-NEXT:    strh wzr, [x1]
-; CHECK-NEXT:    strh w9, [x0, #6]
+; CHECK-NEXT:    strh w8, [x0, #6]
 ; CHECK-NEXT:    ret
   store i16 4, ptr %ptr
   %addr2 = getelementptr i16, ptr %ptr, i64 1
@@ -251,7 +251,7 @@ define i32 @test_alias_allocas_2xs32(ptr %ptr) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    mov x8, #4
+; CHECK-NEXT:    mov x8, #4 ; =0x4
 ; CHECK-NEXT:    ldr w0, [sp, #4]
 ; CHECK-NEXT:    movk x8, #5, lsl #32
 ; CHECK-NEXT:    str x8, [sp, #8]
@@ -285,9 +285,9 @@ define void @test_atomic(ptr %ptr) {
 ; CHECK-LABEL: test_atomic:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    add x9, x8, #4
 ; CHECK-NEXT:    stlr wzr, [x8]
-; CHECK-NEXT:    stlr wzr, [x9]
+; CHECK-NEXT:    add x8, x8, #4
+; CHECK-NEXT:    stlr wzr, [x8]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load ptr, ptr %ptr, align 8
@@ -304,14 +304,14 @@ entry:
 define i32 @test_alias_3xs16(ptr %ptr, ptr %ptr2, ptr %ptr3, ptr noalias %safe_ptr) {
 ; CHECK-LABEL: test_alias_3xs16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov x10, #9
 ; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    mov w9, #5 ; =0x5
 ; CHECK-NEXT:    ldr w0, [x3]
-; CHECK-NEXT:    mov w9, #5
-; CHECK-NEXT:    movk x10, #14, lsl #32
 ; CHECK-NEXT:    str w9, [x8, #4]
+; CHECK-NEXT:    mov x9, #9 ; =0x9
+; CHECK-NEXT:    movk x9, #14, lsl #32
 ; CHECK-NEXT:    strh wzr, [x8, #4]
-; CHECK-NEXT:    str x10, [x8, #8]
+; CHECK-NEXT:    str x9, [x8, #8]
 ; CHECK-NEXT:    ret
   %safeld = load i32, ptr %safe_ptr
   %addr2 = getelementptr i32, ptr %ptr, i64 1

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
index 9dfb2696594bef..6d27e4f4d603bd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
@@ -55,8 +55,8 @@ handler:
 ; "caller2" is the caller of "foo", it calls "foo" inside a loop.
 define float @caller2(ptr %error_ref) {
 ; CHECK-LABEL: caller2:
-; CHECK: mov [[ID:x[0-9]+]], x0
 ; CHECK: fmov [[CMP:s[0-9]+]], #1.0
+; CHECK: mov [[ID:x[0-9]+]], x0
 ; CHECK: mov x21, xzr
 ; CHECK: bl {{.*}}foo
 ; CHECK: cbnz x21
@@ -160,8 +160,8 @@ define void @foo_sret(ptr sret(%struct.S) %agg.result, i32 %val1, ptr swifterror
 ; CHECK-DAG: mov w0, #16
 ; CHECK: malloc
 ; CHECK: mov [[ID:w[0-9]+]], #1
-; CHECK: mov x21, x0
 ; CHECK: strb [[ID]], [x0, #8]
+; CHECK: mov x21, x0
 ; CHECK: str w{{.*}}, [{{.*}}[[SRET]], #4]
 ; CHECK-NOT: x21
 
@@ -214,9 +214,8 @@ define float @foo_vararg(ptr swifterror %error_ptr_ref, ...) {
 ; CHECK: mov w0, #16
 ; CHECK: malloc
 ; CHECK: mov [[ID:w[0-9]+]], #1
-; CHECK: mov x21, x0
-; CHECK-NOT: x21
 ; CHECK: strb [[ID]], [x0, #8]
+; CHECK: mov x21, x0
 ; CHECK-NOT: x21
 
 ; First vararg
@@ -336,6 +335,7 @@ entry:
 ; CHECK:  str     xzr, [sp]
 ; CHECK:  bl      _params_in_reg2
 ; Restore original arguments for next call.
+; CHECK:  ldr     x8, [sp, #24]
 ; CHECK:  mov      x1, x20
 ; CHECK:  mov      x2, x22
 ; CHECK:  mov      x3, x23
@@ -345,12 +345,13 @@ entry:
 ; CHECK:  mov      x7, x27
 ; Restore original swiftself argument and swifterror %err.
 ; CHECK:  mov      x21, x28
-; CHECK:  ldr      x8, [sp
 ; CHECK:  bl      _params_in_reg2
 ; Restore callee save registers but don't clobber swifterror x21.
 ; CHECK-NOT: x21
 ; CHECK:  ldp     x29, x30, [sp
 ; CHECK-NOT: x21
+; CHECK:  ldr     x28, [sp
+; CHECK-NOT: x21
 ; CHECK:  ldp     x20, x19, [sp
 ; CHECK-NOT: x21
 ; CHECK:  ldp     x23, x22, [sp
@@ -359,8 +360,6 @@ entry:
 ; CHECK-NOT: x21
 ; CHECK:  ldp     x27, x26, [sp
 ; CHECK-NOT: x21
-; CHECK:  ldr     x28, [sp
-; CHECK-NOT: x21
 ; CHECK:  ret
 define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, i64, i64, ptr, ptr nocapture swifterror %err) {
   %error_ptr_ref = alloca swifterror ptr, align 8
@@ -373,7 +372,7 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, ptr
 
 ; CHECK-LABEL: params_and_return_in_reg
 ; Store callee saved registers.
-; CHECK:  stp     x28, x21, [sp, #16
+; CHECK:  stp     x28, x0, [sp, #16
 ; CHECK:  stp     x27, x26, [sp
 ; CHECK:  stp     x25, x24, [sp
 ; CHECK:  stp     x23, x22, [sp
@@ -399,9 +398,9 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, ptr
 ; CHECK:  mov      x21, xzr
 ; CHECK:  bl      _params_in_reg2
 ; Store swifterror %error_ptr_ref.
+; CHECK:  ldr     x0, [sp, #24]
 ; CHECK:  stp     {{x[0-9]+}}, x21, [sp]
 ; Setup call arguments from original arguments.
-; CHECK:  mov      x0, x19
 ; CHECK:  mov      x1, x20
 ; CHECK:  mov      x2, x22
 ; CHECK:  mov      x3, x23
@@ -409,19 +408,19 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, ptr
 ; CHECK:  mov      x5, x25
 ; CHECK:  mov      x6, x26
 ; CHECK:  mov      x7, x27
-; CHECK:  ldr      x21, [sp, #24
+; CHECK:  mov      x21, x28
 ; CHECK:  bl      _params_and_return_in_reg2
+; CHECK:  mov      x19, x21
+; CHECK:  ldr      x21, [sp, #8
 ; Store return values.
-; CHECK:  mov      x19, x0
-; CHECK:  mov      x20, x1
-; CHECK:  mov      x22, x2
-; CHECK:  mov      x23, x3
-; CHECK:  mov      x24, x4
-; CHECK:  mov      x25, x5
-; CHECK:  mov      x26, x6
-; CHECK:  mov      x27, x7
-; Save swifterror %err.
-; CHECK:  mov      x28, x21
+; CHECK:  mov     x20, x0
+; CHECK:  mov     x22, x1
+; CHECK:  mov     x23, x2
+; CHECK:  mov     x24, x3
+; CHECK:  mov     x25, x4
+; CHECK:  mov     x26, x5
+; CHECK:  mov     x27, x6
+; CHECK:  mov     x28, x7
 ; Setup call.
 ; CHECK:  mov     w0, #1
 ; CHECK:  mov     w1, #2
@@ -431,26 +430,25 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, ptr
 ; CHECK:  mov     w5, #6
 ; CHECK:  mov     w6, #7
 ; CHECK:  mov     w7, #8
-; ... setup call with swiferror %error_ptr_ref.
-; CHECK:  ldr     x21, [sp, #8]
+; CHECK:  str     xzr, [sp]
 ; CHECK:  bl      _params_in_reg2
 ; Restore return values for return from this function.
-; CHECK:  mov      x0, x19
-; CHECK:  mov      x1, x20
-; CHECK:  mov      x2, x22
-; CHECK:  mov      x3, x23
-; CHECK:  mov      x4, x24
-; CHECK:  mov      x5, x25
-; CHECK:  mov      x6, x26
-; CHECK:  mov      x7, x27
-; CHECK:  ldp     x29, x30, [sp
-; CHECK:  mov      x21, x28
-; Restore callee save registers.
-; CHECK:  ldp     x20, x19, [sp
-; CHECK:  ldp     x23, x22, [sp
-; CHECK:  ldp     x25, x24, [sp
-; CHECK:  ldp     x27, x26, [sp
-; CHECK:  ldr     x28, [sp
+; CHECK:  mov     x0, x20
+; CHECK:  mov     x1, x22
+; CHECK:  mov     x2, x23
+; CHECK:  mov     x3, x24
+; CHECK:  mov     x4, x25
+; CHECK:  mov     x5, x26
+; CHECK:  mov     x6, x27
+; CHECK:  mov     x21, x19
+; CHECK:  mov     x7, x28
+; CHECK:  ldp     x29, x30, [sp, #96]             ; 16-byte Folded Reload
+; CHECK:  ldr     x28, [sp, #16]                  ; 8-byte Folded Reload
+; CHECK:  ldp     x20, x19, [sp, #80]             ; 16-byte Folded Reload
+; CHECK:  ldp     x23, x22, [sp, #64]             ; 16-byte Folded Reload
+; CHECK:  ldp     x25, x24, [sp, #48]             ; 16-byte Folded Reload
+; CHECK:  ldp     x27, x26, [sp, #32]             ; 16-byte Folded Reload
+; CHECK:  add     sp, sp, #112
 ; CHECK:  ret
 define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_reg(i64, i64, i64, i64, i64, i64, i64, i64, ptr , ptr nocapture swifterror %err) {
   %error_ptr_ref = alloca swifterror ptr, align 8

diff --git a/llvm/test/CodeGen/AArch64/a57-csel.ll b/llvm/test/CodeGen/AArch64/a57-csel.ll
index b8df1d9eaa9359..c96f03e57e0fae 100644
--- a/llvm/test/CodeGen/AArch64/a57-csel.ll
+++ b/llvm/test/CodeGen/AArch64/a57-csel.ll
@@ -1,9 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mcpu=cortex-a57 -aarch64-enable-early-ifcvt=false | FileCheck %s
 
 ; Check that the select isn't expanded into a branch sequence
 ; when the icmp's first operand %x0 comes from a load.
 define i64 @f(i64 %a, i64 %b, ptr %c, i64 %d, i64 %e) {
-  ; CHECK: csel
+; CHECK-LABEL: f:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr x8, [x2]
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel x8, x0, x1, eq
+; CHECK-NEXT:    add x0, x8, x3
+; CHECK-NEXT:    ret
   %x0 = load i64, ptr %c
   %x1 = icmp eq i64 %x0, 0
   %x2 = select i1 %x1, i64 %a, i64 %b

diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 1c05fe737883ce..e3bf87c7bb79c3 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -91,13 +91,13 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias
 ; GISEL-NEXT:    usubl v2.4s, v3.4h, v4.4h
 ; GISEL-NEXT:    cmgt v3.4s, v0.4s, v1.4s
 ; GISEL-NEXT:    neg v4.4s, v1.4s
-; GISEL-NEXT:    cmgt v0.4s, v0.4s, v2.4s
 ; GISEL-NEXT:    shl v3.4s, v3.4s, #31
-; GISEL-NEXT:    shl v0.4s, v0.4s, #31
+; GISEL-NEXT:    cmgt v0.4s, v0.4s, v2.4s
 ; GISEL-NEXT:    neg v5.4s, v2.4s
 ; GISEL-NEXT:    sshr v3.4s, v3.4s, #31
-; GISEL-NEXT:    sshr v0.4s, v0.4s, #31
+; GISEL-NEXT:    shl v0.4s, v0.4s, #31
 ; GISEL-NEXT:    bit v1.16b, v4.16b, v3.16b
+; GISEL-NEXT:    sshr v0.4s, v0.4s, #31
 ; GISEL-NEXT:    bsl v0.16b, v5.16b, v2.16b
 ; GISEL-NEXT:    add v0.4s, v1.4s, v0.4s
 ; GISEL-NEXT:    addv s0, v0.4s
@@ -122,9 +122,9 @@ define i32 @oversized_ADDV_512(ptr %arr)  {
 ; SDAG-LABEL: oversized_ADDV_512:
 ; SDAG:       // %bb.0:
 ; SDAG-NEXT:    ldp q0, q1, [x0, #32]
-; SDAG-NEXT:    ldp q3, q2, [x0]
-; SDAG-NEXT:    add v0.4s, v3.4s, v0.4s
-; SDAG-NEXT:    add v1.4s, v2.4s, v1.4s
+; SDAG-NEXT:    ldp q2, q3, [x0]
+; SDAG-NEXT:    add v1.4s, v3.4s, v1.4s
+; SDAG-NEXT:    add v0.4s, v2.4s, v0.4s
 ; SDAG-NEXT:    add v0.4s, v0.4s, v1.4s
 ; SDAG-NEXT:    addv s0, v0.4s
 ; SDAG-NEXT:    fmov w0, s0

diff --git a/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll b/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
index dd562a4b2177b5..4afe3626864401 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -6,11 +6,11 @@
 define dso_local void @movi_modimm_t1() nounwind {
 ; CHECK-LABEL: movi_modimm_t1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #1
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -22,11 +22,11 @@ define dso_local void @movi_modimm_t1() nounwind {
 define dso_local void @movi_modimm_t2() nounwind {
 ; CHECK-LABEL: movi_modimm_t2:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #1, lsl #8
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.4s, #1, lsl #8
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -38,11 +38,11 @@ define dso_local void @movi_modimm_t2() nounwind {
 define dso_local void @movi_modimm_t3() nounwind {
 ; CHECK-LABEL: movi_modimm_t3:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #1, lsl #16
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.4s, #1, lsl #16
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -54,11 +54,11 @@ define dso_local void @movi_modimm_t3() nounwind {
 define dso_local void @movi_modimm_t4() nounwind {
 ; CHECK-LABEL: movi_modimm_t4:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #1, lsl #24
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.4s, #1, lsl #24
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -70,11 +70,11 @@ define dso_local void @movi_modimm_t4() nounwind {
 define dso_local void @movi_modimm_t5() nounwind {
 ; CHECK-LABEL: movi_modimm_t5:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.8h, #1
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -86,11 +86,11 @@ define dso_local void @movi_modimm_t5() nounwind {
 define dso_local void @movi_modimm_t6() nounwind {
 ; CHECK-LABEL: movi_modimm_t6:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.8h, #1, lsl #8
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.8h, #1, lsl #8
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -102,11 +102,11 @@ define dso_local void @movi_modimm_t6() nounwind {
 define dso_local void @movi_modimm_t7() nounwind {
 ; CHECK-LABEL: movi_modimm_t7:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #1, msl #8
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.4s, #1, msl #8
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -118,11 +118,11 @@ define dso_local void @movi_modimm_t7() nounwind {
 define dso_local void @movi_modimm_t8() nounwind {
 ; CHECK-LABEL: movi_modimm_t8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.4s, #1, msl #16
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.4s, #1, msl #16
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -134,11 +134,11 @@ define dso_local void @movi_modimm_t8() nounwind {
 define dso_local void @movi_modimm_t9() nounwind {
 ; CHECK-LABEL: movi_modimm_t9:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.16b, #1
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.16b, #1
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -150,11 +150,11 @@ define dso_local void @movi_modimm_t9() nounwind {
 define dso_local void @movi_modimm_t10() nounwind {
 ; CHECK-LABEL: movi_modimm_t10:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v0.2d, #0x00ffff0000ffff
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -166,11 +166,11 @@ define dso_local void @movi_modimm_t10() nounwind {
 define dso_local void @fmov_modimm_t11() nounwind {
 ; CHECK-LABEL: fmov_modimm_t11:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov v0.4s, #3.00000000
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    fmov v1.4s, #3.00000000
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16
@@ -182,11 +182,11 @@ define dso_local void @fmov_modimm_t11() nounwind {
 define dso_local void @fmov_modimm_t12() nounwind {
 ; CHECK-LABEL: fmov_modimm_t12:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov v0.2d, #0.17968750
 ; CHECK-NEXT:    adrp x8, vec_v8i16
 ; CHECK-NEXT:    add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT:    fmov v1.2d, #0.17968750
-; CHECK-NEXT:    ld1 { v0.8h }, [x8]
-; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ld1 { v1.8h }, [x8]
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    st1 { v0.8h }, [x8]
 ; CHECK-NEXT:    ret
   %in = load <8 x i16>, ptr @vec_v8i16

diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index 4cfc879526dd50..92fd446ebe8f58 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -151,23 +151,23 @@ define <4 x i32> @test_bit_sink_operand(<4 x i32> %src, <4 x i32> %dst, <4 x i32
 ; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    cinc w9, w0, lt
-; CHECK-NEXT:    asr w9, w9, #1
+; CHECK-NEXT:    mov w9, wzr
+; CHECK-NEXT:    cinc w8, w0, lt
+; CHECK-NEXT:    asr w8, w8, #1
 ; CHECK-NEXT:  .LBB11_1: // %do.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    bit v1.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    add x10, sp, #16
-; CHECK-NEXT:    bfi x10, x8, #2, #2
 ; CHECK-NEXT:    mov x11, sp
-; CHECK-NEXT:    bfi x11, x8, #2, #2
-; CHECK-NEXT:    add w8, w8, #1
-; CHECK-NEXT:    cmp w8, #5
+; CHECK-NEXT:    bfi x10, x9, #2, #2
+; CHECK-NEXT:    bfi x11, x9, #2, #2
+; CHECK-NEXT:    add w9, w9, #1
+; CHECK-NEXT:    cmp w9, #5
 ; CHECK-NEXT:    str q1, [sp, #16]
 ; CHECK-NEXT:    str w0, [x10]
 ; CHECK-NEXT:    ldr q1, [sp, #16]
 ; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    str w9, [x11]
+; CHECK-NEXT:    str w8, [x11]
 ; CHECK-NEXT:    ldr q0, [sp]
 ; CHECK-NEXT:    b.ne .LBB11_1
 ; CHECK-NEXT:  // %bb.2: // %do.end

diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll
index 2a369eade5a91c..e086ab92421fb5 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-add-sub-mul.ll
@@ -4,8 +4,8 @@
 define <2 x i64> @test_mul_add_2x64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: test_mul_add_2x64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
@@ -19,8 +19,8 @@ define <2 x i64> @test_mul_add_2x64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 define <1 x i64> @test_mul_add_1x64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) {
 ; CHECK-LABEL: test_mul_add_1x64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
@@ -34,8 +34,8 @@ define <1 x i64> @test_mul_add_1x64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) {
 define <2 x i64> @test_mul_sub_2x64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 ; CHECK-LABEL: test_mul_sub_2x64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mls z0.d, p0/m, z1.d, z2.d
@@ -49,14 +49,15 @@ define <2 x i64> @test_mul_sub_2x64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
 define <2 x i64> @test_mul_sub_2x64_2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
 ; CHECK-LABEL: test_mul_sub_2x64_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z3.d
+; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
+; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    sub v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z3.d
+; CHECK-NEXT:    sub v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT:    ret
   %div = sdiv <2 x i64> %a, %b
   %mul = mul <2 x i64> %c, %d
@@ -67,8 +68,8 @@ define <2 x i64> @test_mul_sub_2x64_2(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c,
 define <2 x i64> @test_mul_sub_2x64_3(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
 ; CHECK-LABEL: test_mul_sub_2x64_3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z3
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
@@ -85,8 +86,8 @@ define <2 x i64> @test_mul_sub_2x64_3(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c,
 define <1 x i64> @test_mul_sub_1x64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) {
 ; CHECK-LABEL: test_mul_sub_1x64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    sub d0, d1, d0

diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
index 36b81d8e495ce6..fdeae9f326ad83 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
@@ -4,8 +4,8 @@
 define <vscale x 2 x i16> @dupsext_v2i8_v2i16(i8 %src, <vscale x 2 x i16> %b) {
 ; CHECK-LABEL: dupsext_v2i8_v2i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -20,8 +20,8 @@ entry:
 define <vscale x 4 x i16> @dupsext_v4i8_v4i16(i8 %src, <vscale x 4 x i16> %b) {
 ; CHECK-LABEL: dupsext_v4i8_v4i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -36,8 +36,8 @@ entry:
 define <vscale x 8 x i16> @dupsext_v8i8_v8i16(i8 %src, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: dupsext_v8i8_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
@@ -52,8 +52,8 @@ entry:
 define <vscale x 2 x i32> @dupsext_v2i8_v2i32(i8 %src, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: dupsext_v2i8_v2i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -68,8 +68,8 @@ entry:
 define <vscale x 4 x i32> @dupsext_v4i8_v4i32(i8 %src, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: dupsext_v4i8_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -84,9 +84,9 @@ entry:
 define <vscale x 2 x i64> @dupsext_v2i8_v2i64(i8 %src, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: dupsext_v2i8_v2i64:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -101,8 +101,8 @@ entry:
 define <vscale x 2 x i32> @dupsext_v2i16_v2i32(i16 %src, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: dupsext_v2i16_v2i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxth w8, w0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxth w8, w0
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -117,8 +117,8 @@ entry:
 define <vscale x 4 x i32> @dupsext_v4i16_v4i32(i16 %src, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: dupsext_v4i16_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxth w8, w0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxth w8, w0
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -133,9 +133,9 @@ entry:
 define <vscale x 2 x i64> @dupsext_v2i16_v2i64(i16 %src, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: dupsext_v2i16_v2i64:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -150,9 +150,9 @@ entry:
 define <vscale x 2 x i64> @dupsext_v2i32_v2i64(i32 %src, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: dupsext_v2i32_v2i64:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -167,8 +167,8 @@ entry:
 define <vscale x 2 x i16> @dupzext_v2i8_v2i16(i8 %src, <vscale x 2 x i16> %b) {
 ; CHECK-LABEL: dupzext_v2i8_v2i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -183,8 +183,8 @@ entry:
 define <vscale x 4 x i16> @dupzext_v4i8_v4i16(i8 %src, <vscale x 4 x i16> %b) {
 ; CHECK-LABEL: dupzext_v4i8_v4i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -199,8 +199,8 @@ entry:
 define <vscale x 8 x i16> @dupzext_v8i8_v8i16(i8 %src, <vscale x 8 x i16> %b) {
 ; CHECK-LABEL: dupzext_v8i8_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
@@ -215,8 +215,8 @@ entry:
 define <vscale x 2 x i32> @dupzext_v2i8_v2i32(i8 %src, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: dupzext_v2i8_v2i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -231,8 +231,8 @@ entry:
 define <vscale x 4 x i32> @dupzext_v4i8_v4i32(i8 %src, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: dupzext_v4i8_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -247,9 +247,9 @@ entry:
 define <vscale x 2 x i64> @dupzext_v2i8_v2i64(i8 %src, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: dupzext_v2i8_v2i64:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0xff
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -264,8 +264,8 @@ entry:
 define <vscale x 2 x i32> @dupzext_v2i16_v2i32(i16 %src, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: dupzext_v2i16_v2i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xffff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    and w8, w0, #0xffff
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -280,8 +280,8 @@ entry:
 define <vscale x 4 x i32> @dupzext_v4i16_v4i32(i16 %src, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: dupzext_v4i16_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xffff
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    and w8, w0, #0xffff
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -296,9 +296,9 @@ entry:
 define <vscale x 2 x i64> @dupzext_v2i16_v2i64(i16 %src, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: dupzext_v2i16_v2i64:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0xffff
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -313,8 +313,8 @@ entry:
 define <vscale x 2 x i64> @dupzext_v2i32_v2i64(i32 %src, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: dupzext_v2i32_v2i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index 1506a537b4180e..2dde251cbc02f9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -100,8 +100,8 @@ define <2 x i16> @dupsext_v2i8_v2i16(i8 %src, <2 x i8> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sxtb w8, w0
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    dup v1.2s, w8
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mul v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    ret
 entry:
@@ -116,8 +116,8 @@ entry:
 define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) {
 ; CHECK-LABEL: dupzext_v2i16_v2i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xffff
 ; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    and w8, w0, #0xffff
 ; CHECK-NEXT:    dup v2.2s, w8
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    umull v0.2d, v2.2s, v0.2s
@@ -191,8 +191,8 @@ define void @typei1_orig(i64 %a, ptr %p, ptr %q) {
 ; CHECK-NEXT:    cmp x0, #0
 ; CHECK-NEXT:    ldr q0, [x2]
 ; CHECK-NEXT:    cset w8, gt
-; CHECK-NEXT:    cmtst v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    dup v1.8h, w8
+; CHECK-NEXT:    cmtst v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    cmeq v1.8h, v1.8h, #0
 ; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
@@ -217,8 +217,8 @@ define void @typei1_orig(i64 %a, ptr %p, ptr %q) {
 define <8 x i16> @typei1_v8i1_v8i16(i1 %src, <8 x i1> %b) {
 ; CHECK-LABEL: typei1_v8i1_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    movi v1.8b, #1
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    dup v2.8b, w8
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    umull v0.8h, v2.8b, v0.8b

diff --git a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index a304ec09e88985..7e97116d9d0227 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -297,15 +297,15 @@ entry:
 ; CHECK: .cfi_offset w20, -16
 ; CHECK: .cfi_offset w30, -24
 ; CHECK: .cfi_offset w29, -32
+;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfiz	 x8, x0, #2, #32
 ;   Check correct access to arguments passed on the stack, through frame pointer
 ; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #40]
-;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
-; CHECK: ubfiz	 x9, x0, #2, #32
-; CHECK: add	 x9, x9, #15
 ; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #56]
-; CHECK: and	 x9, x9, #0x7fffffff0
+; CHECK: add	 x8, x8, #15
+; CHECK: and	 x8, x8, #0x7fffffff0
 ; CHECK: mov	 x10, sp
-; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x8
 ; CHECK: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through frame pointer
 ; CHECK: ldur	w[[ILOC:[0-9]+]], [x29, #-4]
@@ -342,16 +342,16 @@ entry:
 ;   Check that space is reserved on the stack for the local variable,
 ;   rounded up to a multiple of 16 to keep the stack pointer 16-byte aligned.
 ; CHECK: sub	sp, sp, #16
+;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfiz	x8, x0, #2, #32
 ;   Check correctness of cfi pseudo-instructions
 ;   Check correct access to arguments passed on the stack, through frame pointer
 ; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
-;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
-; CHECK: ubfiz	x9, x0, #2, #32
-; CHECK: add	x9, x9, #15
 ; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
-; CHECK: and	x9, x9, #0x7fffffff0
+; CHECK: add	x8, x8, #15
+; CHECK: and	x8, x8, #0x7fffffff0
 ; CHECK: mov	x10, sp
-; CHECK: sub	x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: sub	x[[VLASPTMP:[0-9]+]], x10, x8
 ; CHECK: mov	sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through frame pointer
 ; CHECK: ldur	w[[ILOC:[0-9]+]], [x29, #-4]
@@ -402,16 +402,16 @@ entry:
 ; CHECK: .cfi_offset w21, -32
 ; CHECK: .cfi_offset w30, -40
 ; CHECK: .cfi_offset w29, -48
-;   Check correct access to arguments passed on the stack, through frame pointer
-; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #56]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
 ;   and set-up of base pointer (x19).
-; CHECK: ubfiz	 x9, x0, #2, #32
-; CHECK: add	 x9, x9, #15
+; CHECK: ubfiz	 x8, x0, #2, #32
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #56]
 ; CHECK: ldr	 d[[DARG:[0-9]+]], [x29, #72]
-; CHECK: and	 x9, x9, #0x7fffffff0
+; CHECK: add	 x8, x8, #15
+; CHECK: and	 x8, x8, #0x7fffffff0
 ; CHECK: mov	 x10, sp
-; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x8
 ; CHECK: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through base pointer
 ; CHECK: ldr	w[[ILOC:[0-9]+]], [x19]
@@ -448,16 +448,16 @@ entry:
 ; CHECK-MACHO: .cfi_offset w20, -32
 ; CHECK-MACHO: .cfi_offset w21, -40
 ; CHECK-MACHO: .cfi_offset w22, -48
-;   Check correct access to arguments passed on the stack, through frame pointer
-; CHECK-MACHO: ldr	w[[IARG:[0-9]+]], [x29, #20]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
 ;   and set-up of base pointer (x19).
-; CHECK-MACHO: ubfiz	 x9, x0, #2, #32
-; CHECK-MACHO: add	 x9, x9, #15
+; CHECK-MACHO: ubfiz	 x8, x0, #2, #32
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK-MACHO: ldr	w[[IARG:[0-9]+]], [x29, #20]
 ; CHECK-MACHO: ldr	d[[DARG:[0-9]+]], [x29, #32]
-; CHECK-MACHO: and	 x9, x9, #0x7fffffff0
+; CHECK-MACHO: add	 x8, x8, #15
+; CHECK-MACHO: and	 x8, x8, #0x7fffffff0
 ; CHECK-MACHO: mov	 x10, sp
-; CHECK-MACHO: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK-MACHO: sub	 x[[VLASPTMP:[0-9]+]], x10, x8
 ; CHECK-MACHO: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through base pointer
 ; CHECK-MACHO: ldr	w[[ILOC:[0-9]+]], [x19]
@@ -500,16 +500,16 @@ entry:
 ; CHECK: sub	x9, sp, #96
 ; CHECK: and	sp, x9, #0xffffffffffffff80
 ; CHECK: mov    x19, sp
-;   Check correct access to arguments passed on the stack, through frame pointer
-; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #40]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
 ;   and set-up of base pointer (x19).
-; CHECK: ubfiz	 x9, x0, #2, #32
-; CHECK: add	 x9, x9, #15
+; CHECK: ubfiz	 x8, x0, #2, #32
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #40]
 ; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #56]
-; CHECK: and	 x9, x9, #0x7fffffff0
+; CHECK: add	 x8, x8, #15
+; CHECK: and	 x8, x8, #0x7fffffff0
 ; CHECK: mov	 x10, sp
-; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x8
 ; CHECK: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through base pointer
 ; CHECK: ldr	w[[ILOC:[0-9]+]], [x19]
@@ -534,16 +534,16 @@ entry:
 ; CHECK-MACHO: sub	x9, sp, #96
 ; CHECK-MACHO: and	sp, x9, #0xffffffffffffff80
 ; CHECK-MACHO: mov    x19, sp
-;   Check correct access to arguments passed on the stack, through frame pointer
-; CHECK-MACHO: ldr	w[[IARG:[0-9]+]], [x29, #20]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
 ;   and set-up of base pointer (x19).
-; CHECK-MACHO: ubfiz	 x9, x0, #2, #32
-; CHECK-MACHO: add 	 x9, x9, #15
+; CHECK-MACHO: ubfiz	 x8, x0, #2, #32
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK-MACHO: ldr	w[[IARG:[0-9]+]], [x29, #20]
 ; CHECK-MACHO: ldr	d[[DARG:[0-9]+]], [x29, #32]
-; CHECK-MACHO: and	 x9, x9, #0x7fffffff0
+; CHECK-MACHO: add 	 x8, x8, #15
+; CHECK-MACHO: and	 x8, x8, #0x7fffffff0
 ; CHECK-MACHO: mov	 x10, sp
-; CHECK-MACHO: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK-MACHO: sub	 x[[VLASPTMP:[0-9]+]], x10, x8
 ; CHECK-MACHO: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through base pointer
 ; CHECK-MACHO: ldr	w[[ILOC:[0-9]+]], [x19]
@@ -584,16 +584,16 @@ entry:
 ; CHECK: sub	x9, sp, #7, lsl #12
 ; CHECK: and	sp, x9, #0xffffffffffff8000
 ; CHECK: mov    x19, sp
-;   Check correct access to arguments passed on the stack, through frame pointer
-; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #40]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
 ;   and set-up of base pointer (x19).
-; CHECK: ubfiz	 x9, x0, #2, #32
-; CHECK: add	 x9, x9, #15
+; CHECK: ubfiz	 x8, x0, #2, #32
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #40]
 ; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #56]
-; CHECK: and	 x9, x9, #0x7fffffff0
+; CHECK: add	 x8, x8, #15
+; CHECK: and	 x8, x8, #0x7fffffff0
 ; CHECK: mov	 x10, sp
-; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x8
 ; CHECK: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through base pointer
 ; CHECK: ldr	w[[ILOC:[0-9]+]], [x19]
@@ -618,16 +618,16 @@ entry:
 ; CHECK-MACHO: sub	x9, sp, #7, lsl #12
 ; CHECK-MACHO: and	sp, x9, #0xffffffffffff8000
 ; CHECK-MACHO: mov    x19, sp
-;   Check correct access to arguments passed on the stack, through frame pointer
-; CHECK-MACHO: ldr	w[[IARG:[0-9]+]], [x29, #20]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
 ;   and set-up of base pointer (x19).
-; CHECK-MACHO: ubfiz	 x9, x0, #2, #32
-; CHECK-MACHO: add	 x9, x9, #15
+; CHECK-MACHO: ubfiz	 x8, x0, #2, #32
+;   Check correct access to arguments passed on the stack, through frame pointer
+; CHECK-MACHO: ldr	w[[IARG:[0-9]+]], [x29, #20]
 ; CHECK-MACHO: ldr	d[[DARG:[0-9]+]], [x29, #32]
-; CHECK-MACHO: and	 x9, x9, #0x7fffffff0
+; CHECK-MACHO: add	 x8, x8, #15
+; CHECK-MACHO: and	 x8, x8, #0x7fffffff0
 ; CHECK-MACHO: mov	 x10, sp
-; CHECK-MACHO: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK-MACHO: sub	 x[[VLASPTMP:[0-9]+]], x10, x8
 ; CHECK-MACHO: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through base pointer
 ; CHECK-MACHO: ldr	w[[ILOC:[0-9]+]], [x19]

diff --git a/llvm/test/CodeGen/AArch64/aarch64-fixup-statepoint-regs-crash.ll b/llvm/test/CodeGen/AArch64/aarch64-fixup-statepoint-regs-crash.ll
index 055944a14b514d..95ccb983dfde00 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fixup-statepoint-regs-crash.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fixup-statepoint-regs-crash.ll
@@ -14,8 +14,8 @@ define dso_local ptr addrspace(1) @foo(ptr addrspace(1) %arg) gc "statepoint-exa
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    str q0, [sp, #16]
+; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl baz // 8-byte Folded Reload
 ; CHECK-NEXT:  .Ltmp0:
 ; CHECK-NEXT:    ldp x19, x0, [sp, #8] // 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index ae3827cc65b5ea..59cd87f58ab08e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -14,8 +14,8 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind {
 ; CHECK0-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK0-NEXT:    ubfx x8, x1, #9, #8
 ; CHECK0-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    lsl x21, x8, #1
 ; CHECK0-NEXT:    mov x19, x0
+; CHECK0-NEXT:    lsl x21, x8, #1
 ; CHECK0-NEXT:    ldrh w20, [x0, x21]
 ; CHECK0-NEXT:    bl foo
 ; CHECK0-NEXT:    mov w0, w20
@@ -55,8 +55,8 @@ define i32 @word(ptr %ctx, i32 %xor72) nounwind {
 ; CHECK0-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK0-NEXT:    ubfx x8, x1, #9, #8
 ; CHECK0-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    lsl x21, x8, #2
 ; CHECK0-NEXT:    mov x19, x0
+; CHECK0-NEXT:    lsl x21, x8, #2
 ; CHECK0-NEXT:    ldr w20, [x0, x21]
 ; CHECK0-NEXT:    bl foo
 ; CHECK0-NEXT:    mov w0, w20
@@ -96,8 +96,8 @@ define i64 @doubleword(ptr %ctx, i32 %xor72) nounwind {
 ; CHECK0-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK0-NEXT:    ubfx x8, x1, #9, #8
 ; CHECK0-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    lsl x21, x8, #3
 ; CHECK0-NEXT:    mov x19, x0
+; CHECK0-NEXT:    lsl x21, x8, #3
 ; CHECK0-NEXT:    ldr x20, [x0, x21]
 ; CHECK0-NEXT:    bl foo
 ; CHECK0-NEXT:    mov x0, x20

diff --git a/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll b/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
index 15d279fe61b7cd..cbda7b027587d9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
@@ -27,14 +27,14 @@ BB:
 define void @f_undef_15(<8 x i64> %a, ptr %dst) {
 ; CHECK-LABEL: f_undef_15:
 ; CHECK:       // %bb.0: // %BB
-; CHECK-NEXT:    mov x9, x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
-; CHECK-NEXT:    add x8, x0, #64
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    st2 { v0.2d, v1.2d }, [x9], #32
+; CHECK-NEXT:    st2 { v0.2d, v1.2d }, [x8], #32
+; CHECK-NEXT:    st2 { v0.2d, v1.2d }, [x8]
+; CHECK-NEXT:    add x8, x0, #64
 ; CHECK-NEXT:    st2 { v0.2d, v1.2d }, [x8]
 ; CHECK-NEXT:    add x8, x0, #96
-; CHECK-NEXT:    st2 { v0.2d, v1.2d }, [x9]
 ; CHECK-NEXT:    st2 { v0.2d, v1.2d }, [x8]
 ; CHECK-NEXT:    ret
 BB:
@@ -46,20 +46,20 @@ BB:
 define void @f_undef_1(<8 x i64> %a, ptr %dst) {
 ; CHECK-LABEL: f_undef_1:
 ; CHECK:       // %bb.0: // %BB
-; CHECK-NEXT:    mov x9, x0
-; CHECK-NEXT:    add x8, x0, #64
 ; CHECK-NEXT:    mov v16.16b, v0.16b
-; CHECK-NEXT:    // kill: def $q3 killed $q3 def $q3_q4
-; CHECK-NEXT:    mov v17.16b, v16.16b
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov v5.16b, v2.16b
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q1_q2
-; CHECK-NEXT:    st2 { v16.2d, v17.2d }, [x9], #32
-; CHECK-NEXT:    mov v6.16b, v5.16b
+; CHECK-NEXT:    // kill: def $q3 killed $q3 def $q3_q4
 ; CHECK-NEXT:    mov v2.16b, v1.16b
 ; CHECK-NEXT:    mov v4.16b, v3.16b
+; CHECK-NEXT:    mov v17.16b, v16.16b
+; CHECK-NEXT:    mov v6.16b, v5.16b
+; CHECK-NEXT:    st2 { v16.2d, v17.2d }, [x8], #32
+; CHECK-NEXT:    st2 { v1.2d, v2.2d }, [x8]
+; CHECK-NEXT:    add x8, x0, #64
 ; CHECK-NEXT:    st2 { v5.2d, v6.2d }, [x8]
 ; CHECK-NEXT:    add x8, x0, #96
-; CHECK-NEXT:    st2 { v1.2d, v2.2d }, [x9]
 ; CHECK-NEXT:    st2 { v3.2d, v4.2d }, [x8]
 ; CHECK-NEXT:    ret
 BB:
@@ -75,8 +75,8 @@ define void @noundefs(<8 x i32> %a, <8 x i32> %b, ptr %dst) {
 ; CHECK-NEXT:    mov v5.16b, v2.16b
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 def $q2_q3
 ; CHECK-NEXT:    mov v4.16b, v0.16b
-; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x0], #32
 ; CHECK-NEXT:    mov v2.16b, v1.16b
+; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x0], #32
 ; CHECK-NEXT:    st2 { v2.4s, v3.4s }, [x0]
 ; CHECK-NEXT:    ret
 BB:
@@ -91,8 +91,8 @@ define void @undefs(<8 x i32> %a, <8 x i32> %b, ptr %dst) {
 ; CHECK-NEXT:    mov v5.16b, v2.16b
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 def $q2_q3
 ; CHECK-NEXT:    mov v4.16b, v0.16b
-; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x0], #32
 ; CHECK-NEXT:    mov v2.16b, v1.16b
+; CHECK-NEXT:    st2 { v4.4s, v5.4s }, [x0], #32
 ; CHECK-NEXT:    st2 { v2.4s, v3.4s }, [x0]
 ; CHECK-NEXT:    ret
 BB:

diff --git a/llvm/test/CodeGen/AArch64/aarch64-isel-csinc.ll b/llvm/test/CodeGen/AArch64/aarch64-isel-csinc.ll
index cbcd5ef66ae9a8..e113fc0f3579e4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-isel-csinc.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-isel-csinc.ll
@@ -101,8 +101,8 @@ entry:
 define i32 @csinc7(i32 %a, i32 %b) {
 ; CHECK-LABEL: csinc7:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #-4097 // =0xffffefff
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    mov w8, #-4097
 ; CHECK-NEXT:    csinc w8, w8, wzr, eq
 ; CHECK-NEXT:    add w0, w8, w1
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
index b75783339eda30..cc7dffc497495a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -106,18 +106,18 @@ define <2 x i32> @fsext_v2i32(ptr %a) {
 ; CHECK-LE-LABEL: fsext_v2i32:
 ; CHECK-LE:       // %bb.0:
 ; CHECK-LE-NEXT:    ldrsb w8, [x0]
+; CHECK-LE-NEXT:    ldrsb w9, [x0, #1]
 ; CHECK-LE-NEXT:    fmov s0, w8
-; CHECK-LE-NEXT:    ldrsb w8, [x0, #1]
-; CHECK-LE-NEXT:    mov v0.s[1], w8
+; CHECK-LE-NEXT:    mov v0.s[1], w9
 ; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-LE-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: fsext_v2i32:
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    ldrsb w8, [x0]
+; CHECK-BE-NEXT:    ldrsb w9, [x0, #1]
 ; CHECK-BE-NEXT:    fmov s0, w8
-; CHECK-BE-NEXT:    ldrsb w8, [x0, #1]
-; CHECK-BE-NEXT:    mov v0.s[1], w8
+; CHECK-BE-NEXT:    mov v0.s[1], w9
 ; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
 ; CHECK-BE-NEXT:    ret
   %x = load <2 x i8>, ptr %a
@@ -187,12 +187,12 @@ define <8 x i32> @fsext_v8i32(ptr %a) {
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    ld1 { v0.8b }, [x0]
 ; CHECK-BE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-BE-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
+; CHECK-BE-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-BE-NEXT:    sshll2 v0.4s, v0.8h, #0
 ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    rev64 v2.4s, v1.4s
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    ext v0.16b, v2.16b, v2.16b, #8
 ; CHECK-BE-NEXT:    ret
   %x = load <8 x i8>, ptr %a
   %y = sext <8 x i8> %x to <8 x i32>
@@ -251,18 +251,18 @@ define <2 x i16> @fsext_v2i16(ptr %a) {
 ; CHECK-LE-LABEL: fsext_v2i16:
 ; CHECK-LE:       // %bb.0:
 ; CHECK-LE-NEXT:    ldrsb w8, [x0]
+; CHECK-LE-NEXT:    ldrsb w9, [x0, #1]
 ; CHECK-LE-NEXT:    fmov s0, w8
-; CHECK-LE-NEXT:    ldrsb w8, [x0, #1]
-; CHECK-LE-NEXT:    mov v0.s[1], w8
+; CHECK-LE-NEXT:    mov v0.s[1], w9
 ; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-LE-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: fsext_v2i16:
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    ldrsb w8, [x0]
+; CHECK-BE-NEXT:    ldrsb w9, [x0, #1]
 ; CHECK-BE-NEXT:    fmov s0, w8
-; CHECK-BE-NEXT:    ldrsb w8, [x0, #1]
-; CHECK-BE-NEXT:    mov v0.s[1], w8
+; CHECK-BE-NEXT:    mov v0.s[1], w9
 ; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
 ; CHECK-BE-NEXT:    ret
   %x = load <2 x i8>, ptr %a
@@ -344,12 +344,12 @@ define <16 x i16> @fsext_v16i16(ptr %a) {
 ; CHECK-BE-LABEL: fsext_v16i16:
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0]
-; CHECK-BE-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-BE-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
+; CHECK-BE-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-BE-NEXT:    sshll2 v0.8h, v0.16b, #0
 ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    rev64 v2.8h, v1.8h
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    ext v0.16b, v2.16b, v2.16b, #8
 ; CHECK-BE-NEXT:    ret
   %x = load <16 x i8>, ptr %a
   %y = sext <16 x i8> %x to <16 x i16>

diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 3b1b909b091104..2eb1031cefe5bf 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -193,18 +193,18 @@ define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i3
 ; CHECK-NEXT:    b.lt .LBB3_8
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    sxth w8, w1
-; CHECK-NEXT:    mov w9, w3
 ; CHECK-NEXT:    cmp w3, #15
+; CHECK-NEXT:    mov w9, w3
 ; CHECK-NEXT:    b.hi .LBB3_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov x10, xzr
 ; CHECK-NEXT:    b .LBB3_6
 ; CHECK-NEXT:  .LBB3_3: // %vector.ph
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
 ; CHECK-NEXT:    add x11, x2, #32
 ; CHECK-NEXT:    add x12, x0, #16
 ; CHECK-NEXT:    mov x13, x10
-; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:  .LBB3_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
@@ -221,15 +221,15 @@ define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i3
 ; CHECK-NEXT:    cmp x10, x9
 ; CHECK-NEXT:    b.eq .LBB3_8
 ; CHECK-NEXT:  .LBB3_6: // %for.body.preheader1
-; CHECK-NEXT:    sub x9, x9, x10
 ; CHECK-NEXT:    add x11, x2, x10, lsl #2
-; CHECK-NEXT:    add x10, x0, x10, lsl #1
+; CHECK-NEXT:    add x12, x0, x10, lsl #1
+; CHECK-NEXT:    sub x9, x9, x10
 ; CHECK-NEXT:  .LBB3_7: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrsh w12, [x10], #2
+; CHECK-NEXT:    ldrsh w10, [x12], #2
 ; CHECK-NEXT:    subs x9, x9, #1
-; CHECK-NEXT:    mul w12, w12, w8
-; CHECK-NEXT:    str w12, [x11], #4
+; CHECK-NEXT:    mul w10, w10, w8
+; CHECK-NEXT:    str w10, [x11], #4
 ; CHECK-NEXT:    b.ne .LBB3_7
 ; CHECK-NEXT:  .LBB3_8: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
@@ -304,19 +304,19 @@ define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i3
 ; CHECK-NEXT:    cmp w3, #1
 ; CHECK-NEXT:    b.lt .LBB4_8
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    cmp w3, #15
 ; CHECK-NEXT:    and w8, w1, #0xffff
 ; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    cmp w3, #15
 ; CHECK-NEXT:    b.hi .LBB4_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov x10, xzr
 ; CHECK-NEXT:    b .LBB4_6
 ; CHECK-NEXT:  .LBB4_3: // %vector.ph
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:    and x10, x9, #0xfffffff0
 ; CHECK-NEXT:    add x11, x2, #32
 ; CHECK-NEXT:    add x12, x0, #16
 ; CHECK-NEXT:    mov x13, x10
-; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:  .LBB4_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
@@ -333,15 +333,15 @@ define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i3
 ; CHECK-NEXT:    cmp x10, x9
 ; CHECK-NEXT:    b.eq .LBB4_8
 ; CHECK-NEXT:  .LBB4_6: // %for.body.preheader1
-; CHECK-NEXT:    sub x9, x9, x10
 ; CHECK-NEXT:    add x11, x2, x10, lsl #2
-; CHECK-NEXT:    add x10, x0, x10, lsl #1
+; CHECK-NEXT:    add x12, x0, x10, lsl #1
+; CHECK-NEXT:    sub x9, x9, x10
 ; CHECK-NEXT:  .LBB4_7: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrh w12, [x10], #2
+; CHECK-NEXT:    ldrh w10, [x12], #2
 ; CHECK-NEXT:    subs x9, x9, #1
-; CHECK-NEXT:    mul w12, w12, w8
-; CHECK-NEXT:    str w12, [x11], #4
+; CHECK-NEXT:    mul w10, w10, w8
+; CHECK-NEXT:    str w10, [x11], #4
 ; CHECK-NEXT:    b.ne .LBB4_7
 ; CHECK-NEXT:  .LBB4_8: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
@@ -416,8 +416,8 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A,
 ; CHECK-NEXT:    cbz w2, .LBB5_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    sxtb w9, w1
-; CHECK-NEXT:    mov w10, w2
 ; CHECK-NEXT:    cmp w2, #15
+; CHECK-NEXT:    mov w10, w2
 ; CHECK-NEXT:    b.hi .LBB5_4
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov x11, xzr
@@ -428,12 +428,12 @@ define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A,
 ; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB5_4: // %vector.ph
-; CHECK-NEXT:    and x11, x10, #0xfffffff0
-; CHECK-NEXT:    add x8, x0, #8
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov x12, x11
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    and x11, x10, #0xfffffff0
 ; CHECK-NEXT:    dup v2.8h, w9
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    mov x12, x11
 ; CHECK-NEXT:  .LBB5_5: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp d3, d4, [x8, #-8]
@@ -536,8 +536,8 @@ define void @sink_v2z64_1(i32 *%p, i32 *%d, i64 %n, <2 x i32> %a) {
 ; CHECK-NEXT:  .LBB6_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    subs x2, x2, #8
+; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    umull v1.2d, v1.2s, v0.s[1]
 ; CHECK-NEXT:    shrn v1.2s, v1.2d, #15
 ; CHECK-NEXT:    str d1, [x0], #32
@@ -577,8 +577,8 @@ define void @sink_v4i64_1(i32 *%p, i32 *%d, i64 %n, <2 x i32> %a) {
 ; CHECK-NEXT:  .LBB7_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    subs x2, x2, #8
+; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    smull v2.2d, v1.2s, v0.s[1]
 ; CHECK-NEXT:    smull2 v1.2d, v1.4s, v0.s[1]
 ; CHECK-NEXT:    shrn v2.2s, v2.2d, #15
@@ -620,8 +620,8 @@ define void @sink_v8z16_0(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) {
 ; CHECK-NEXT:  .LBB8_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    subs x2, x2, #8
+; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    umull v1.8h, v1.8b, v0.8b
 ; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
 ; CHECK-NEXT:    xtn v1.8b, v1.8h
@@ -657,18 +657,18 @@ exit:
 define void @sink_v16s16_8(i32 *%p, i32 *%d, i64 %n, <16 x i8> %a) {
 ; CHECK-LABEL: sink_v16s16_8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    dup v0.16b, v0.b[10]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB9_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    add x8, x8, #8
 ; CHECK-NEXT:    subs x2, x2, #8
-; CHECK-NEXT:    smull2 v2.8h, v1.16b, v0.16b
-; CHECK-NEXT:    smull v1.8h, v1.8b, v0.8b
-; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT:    add x8, x8, #8
+; CHECK-NEXT:    smull v2.8h, v1.8b, v0.8b
+; CHECK-NEXT:    smull2 v1.8h, v1.16b, v0.16b
 ; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
-; CHECK-NEXT:    uzp1 v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
+; CHECK-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    str q1, [x0], #32
 ; CHECK-NEXT:    b.ne .LBB9_1
 ; CHECK-NEXT:  // %bb.2: // %exit
@@ -765,24 +765,24 @@ for.end12:                                        ; preds = %vector.body
 define void @matrix_mul_unsigned_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 ; CHECK-LABEL: matrix_mul_unsigned_and_double:
 ; CHECK:       // %bb.0: // %vector.header
-; CHECK-NEXT:    and w9, w3, #0xffff
+; CHECK-NEXT:    and w8, w3, #0xffff
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:    and x8, x0, #0xfffffff0
-; CHECK-NEXT:    dup v0.8h, w9
 ; CHECK-NEXT:  .LBB11_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
-; CHECK-NEXT:    add x10, x1, w0, uxtw #2
 ; CHECK-NEXT:    subs x8, x8, #16
-; CHECK-NEXT:    add w0, w0, #16
 ; CHECK-NEXT:    ldr q1, [x9]
 ; CHECK-NEXT:    ldur q2, [x9, #8]
+; CHECK-NEXT:    add x9, x1, w0, uxtw #2
+; CHECK-NEXT:    add w0, w0, #16
 ; CHECK-NEXT:    umull2 v3.4s, v0.8h, v1.8h
 ; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    umull2 v4.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    stp q1, q3, [x10]
-; CHECK-NEXT:    stp q2, q4, [x10, #32]
+; CHECK-NEXT:    stp q1, q3, [x9]
+; CHECK-NEXT:    stp q2, q4, [x9, #32]
 ; CHECK-NEXT:    b.ne .LBB11_1
 ; CHECK-NEXT:  // %bb.2: // %for.end12
 ; CHECK-NEXT:    ret
@@ -833,10 +833,10 @@ for.end12:                                        ; preds = %vector.body
 define void @matrix_mul_signed_and(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 ; CHECK-LABEL: matrix_mul_signed_and:
 ; CHECK:       // %bb.0: // %vector.header
-; CHECK-NEXT:    and w9, w3, #0xffff
+; CHECK-NEXT:    and w8, w3, #0xffff
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    dup v0.4s, w8
 ; CHECK-NEXT:    and x8, x0, #0xfffffff8
-; CHECK-NEXT:    dup v0.4s, w9
 ; CHECK-NEXT:  .LBB12_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1
@@ -899,10 +899,10 @@ for.end12:                                        ; preds = %vector.body
 define void @matrix_mul_signed_and_double(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i32 %val) {
 ; CHECK-LABEL: matrix_mul_signed_and_double:
 ; CHECK:       // %bb.0: // %vector.header
-; CHECK-NEXT:    and w9, w3, #0xffff
+; CHECK-NEXT:    and w8, w3, #0xffff
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    dup v0.4s, w8
 ; CHECK-NEXT:    and x8, x0, #0xfffffff0
-; CHECK-NEXT:    dup v0.4s, w9
 ; CHECK-NEXT:  .LBB13_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x2, w0, uxtw #1

diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops-consecutive.ll b/llvm/test/CodeGen/AArch64/aarch64-mops-consecutive.ll
index 89a1db79356896..60ba4f22c97f69 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops-consecutive.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops-consecutive.ll
@@ -14,23 +14,23 @@ define void @consecutive() {
 ; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 2032
 ; CHECK-MOPS-NEXT:    .cfi_offset w30, -8
 ; CHECK-MOPS-NEXT:    .cfi_offset w29, -16
-; CHECK-MOPS-NEXT:    mov w8, #1000
+; CHECK-MOPS-NEXT:    mov w8, #1000 // =0x3e8
 ; CHECK-MOPS-NEXT:    add x9, sp, #8
 ; CHECK-MOPS-NEXT:    adrp x10, .LCPI0_0
 ; CHECK-MOPS-NEXT:    adrp x11, .LCPI0_1
-; CHECK-MOPS-NEXT:    mov w12, #6424
-; CHECK-MOPS-NEXT:    mov w13, #7452
 ; CHECK-MOPS-NEXT:    setp [x9]!, x8!, xzr
 ; CHECK-MOPS-NEXT:    setm [x9]!, x8!, xzr
 ; CHECK-MOPS-NEXT:    sete [x9]!, x8!, xzr
-; CHECK-MOPS-NEXT:    movk w12, #6938, lsl #16
+; CHECK-MOPS-NEXT:    mov w12, #6424 // =0x1918
 ; CHECK-MOPS-NEXT:    ldr q0, [x10, :lo12:.LCPI0_0]
-; CHECK-MOPS-NEXT:    mov w8, #30
 ; CHECK-MOPS-NEXT:    ldr d1, [x11, :lo12:.LCPI0_1]
+; CHECK-MOPS-NEXT:    mov w8, #7452 // =0x1d1c
+; CHECK-MOPS-NEXT:    movk w12, #6938, lsl #16
+; CHECK-MOPS-NEXT:    strh w8, [sp, #1036]
+; CHECK-MOPS-NEXT:    mov w8, #30 // =0x1e
 ; CHECK-MOPS-NEXT:    add x0, sp, #1008
 ; CHECK-MOPS-NEXT:    add x1, sp, #8
 ; CHECK-MOPS-NEXT:    str w12, [sp, #1032]
-; CHECK-MOPS-NEXT:    strh w13, [sp, #1036]
 ; CHECK-MOPS-NEXT:    str q0, [sp, #1008]
 ; CHECK-MOPS-NEXT:    str d1, [sp, #1024]
 ; CHECK-MOPS-NEXT:    strb w8, [sp, #1038]

diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index 0e7014ac8cbb3c..ff7872c922e32f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -92,7 +92,7 @@ define void @memset_10_zeroval_volatile(ptr %dst) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10 // =0xa
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w1, wzr
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memset
@@ -105,14 +105,14 @@ define void @memset_10_zeroval_volatile(ptr %dst) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    mov w1, wzr
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10 // =0xa
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memset
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memset_10_zeroval_volatile:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #10
+; GISel-MOPS-O0-NEXT:    mov w8, #10 // =0xa
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    mov x9, xzr
 ; GISel-MOPS-O0-NEXT:    setp [x0]!, x8!, x9
@@ -122,7 +122,7 @@ define void @memset_10_zeroval_volatile(ptr %dst) {
 ;
 ; GISel-MOPS-O3-LABEL: memset_10_zeroval_volatile:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #10
+; GISel-MOPS-O3-NEXT:    mov w8, #10 // =0xa
 ; GISel-MOPS-O3-NEXT:    setp [x0]!, x8!, xzr
 ; GISel-MOPS-O3-NEXT:    setm [x0]!, x8!, xzr
 ; GISel-MOPS-O3-NEXT:    sete [x0]!, x8!, xzr
@@ -150,7 +150,7 @@ define void @memset_10000_zeroval(ptr %dst) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10000
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w1, wzr
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memset
@@ -163,14 +163,14 @@ define void @memset_10000_zeroval(ptr %dst) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    mov w1, wzr
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10000
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10000 // =0x2710
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memset
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memset_10000_zeroval:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #10000
+; GISel-MOPS-O0-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    mov x9, xzr
 ; GISel-MOPS-O0-NEXT:    setp [x0]!, x8!, x9
@@ -180,7 +180,7 @@ define void @memset_10000_zeroval(ptr %dst) {
 ;
 ; GISel-MOPS-O3-LABEL: memset_10000_zeroval:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #10000
+; GISel-MOPS-O3-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-MOPS-O3-NEXT:    setp [x0]!, x8!, xzr
 ; GISel-MOPS-O3-NEXT:    setm [x0]!, x8!, xzr
 ; GISel-MOPS-O3-NEXT:    sete [x0]!, x8!, xzr
@@ -192,14 +192,14 @@ define void @memset_10000_zeroval(ptr %dst) {
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_def_cfa_offset 16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_offset w30, -16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w1, wzr
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #10000
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #10000 // =0x2710
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    bl memset
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memset_10000_zeroval:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #10000
+; SDAG-MOPS-O2-NEXT:    mov w8, #10000 // =0x2710
 ; SDAG-MOPS-O2-NEXT:    setp [x0]!, x8!, xzr
 ; SDAG-MOPS-O2-NEXT:    setm [x0]!, x8!, xzr
 ; SDAG-MOPS-O2-NEXT:    sete [x0]!, x8!, xzr
@@ -215,7 +215,7 @@ define void @memset_10000_zeroval_volatile(ptr %dst) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10000
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w1, wzr
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memset
@@ -228,14 +228,14 @@ define void @memset_10000_zeroval_volatile(ptr %dst) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    mov w1, wzr
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10000
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10000 // =0x2710
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memset
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memset_10000_zeroval_volatile:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #10000
+; GISel-MOPS-O0-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    mov x9, xzr
 ; GISel-MOPS-O0-NEXT:    setp [x0]!, x8!, x9
@@ -245,7 +245,7 @@ define void @memset_10000_zeroval_volatile(ptr %dst) {
 ;
 ; GISel-MOPS-O3-LABEL: memset_10000_zeroval_volatile:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #10000
+; GISel-MOPS-O3-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-MOPS-O3-NEXT:    setp [x0]!, x8!, xzr
 ; GISel-MOPS-O3-NEXT:    setm [x0]!, x8!, xzr
 ; GISel-MOPS-O3-NEXT:    sete [x0]!, x8!, xzr
@@ -257,14 +257,14 @@ define void @memset_10000_zeroval_volatile(ptr %dst) {
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_def_cfa_offset 16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_offset w30, -16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w1, wzr
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #10000
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #10000 // =0x2710
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    bl memset
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memset_10000_zeroval_volatile:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #10000
+; SDAG-MOPS-O2-NEXT:    mov w8, #10000 // =0x2710
 ; SDAG-MOPS-O2-NEXT:    setp [x0]!, x8!, xzr
 ; SDAG-MOPS-O2-NEXT:    setm [x0]!, x8!, xzr
 ; SDAG-MOPS-O2-NEXT:    sete [x0]!, x8!, xzr
@@ -423,7 +423,7 @@ define void @memset_10(ptr %dst, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    // implicit-def: $x8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, w1
 ; GISel-WITHOUT-MOPS-O0-NEXT:    and x8, x8, #0xff
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov x9, #72340172838076673
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov x9, #72340172838076673 // =0x101010101010101
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mul x8, x8, x9
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x8, [x0]
 ; GISel-WITHOUT-MOPS-O0-NEXT:    // kill: def $w8 killed $w8 killed $x8
@@ -433,7 +433,7 @@ define void @memset_10(ptr %dst, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-LABEL: memset_10:
 ; GISel-WITHOUT-MOPS-O3:       // %bb.0: // %entry
 ; GISel-WITHOUT-MOPS-O3-NEXT:    // kill: def $w1 killed $w1 def $x1
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov x8, #72340172838076673
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
 ; GISel-WITHOUT-MOPS-O3-NEXT:    and x9, x1, #0xff
 ; GISel-WITHOUT-MOPS-O3-NEXT:    mul x8, x9, x8
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x8, [x0]
@@ -445,7 +445,7 @@ define void @memset_10(ptr %dst, i32 %value) {
 ; GISel-MOPS-O0-NEXT:    // implicit-def: $x8
 ; GISel-MOPS-O0-NEXT:    mov w8, w1
 ; GISel-MOPS-O0-NEXT:    and x8, x8, #0xff
-; GISel-MOPS-O0-NEXT:    mov x9, #72340172838076673
+; GISel-MOPS-O0-NEXT:    mov x9, #72340172838076673 // =0x101010101010101
 ; GISel-MOPS-O0-NEXT:    mul x8, x8, x9
 ; GISel-MOPS-O0-NEXT:    str x8, [x0]
 ; GISel-MOPS-O0-NEXT:    // kill: def $w8 killed $w8 killed $x8
@@ -455,7 +455,7 @@ define void @memset_10(ptr %dst, i32 %value) {
 ; GISel-MOPS-O3-LABEL: memset_10:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
 ; GISel-MOPS-O3-NEXT:    // kill: def $w1 killed $w1 def $x1
-; GISel-MOPS-O3-NEXT:    mov x8, #72340172838076673
+; GISel-MOPS-O3-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
 ; GISel-MOPS-O3-NEXT:    and x9, x1, #0xff
 ; GISel-MOPS-O3-NEXT:    mul x8, x9, x8
 ; GISel-MOPS-O3-NEXT:    str x8, [x0]
@@ -465,7 +465,7 @@ define void @memset_10(ptr %dst, i32 %value) {
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10:
 ; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    // kill: def $w1 killed $w1 def $x1
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov x8, #72340172838076673
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    and x9, x1, #0xff
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    mul x8, x9, x8
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str x8, [x0]
@@ -475,7 +475,7 @@ define void @memset_10(ptr %dst, i32 %value) {
 ; SDAG-MOPS-O2-LABEL: memset_10:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
 ; SDAG-MOPS-O2-NEXT:    // kill: def $w1 killed $w1 def $x1
-; SDAG-MOPS-O2-NEXT:    mov x8, #72340172838076673
+; SDAG-MOPS-O2-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
 ; SDAG-MOPS-O2-NEXT:    and x9, x1, #0xff
 ; SDAG-MOPS-O2-NEXT:    mul x8, x9, x8
 ; SDAG-MOPS-O2-NEXT:    str x8, [x0]
@@ -493,7 +493,7 @@ define void @memset_10_volatile(ptr %dst, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10 // =0xa
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memset
 ; GISel-WITHOUT-MOPS-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -504,14 +504,14 @@ define void @memset_10_volatile(ptr %dst, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10 // =0xa
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memset
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memset_10_volatile:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #10
+; GISel-MOPS-O0-NEXT:    mov w8, #10 // =0xa
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    // implicit-def: $x9
 ; GISel-MOPS-O0-NEXT:    mov w9, w1
@@ -522,7 +522,7 @@ define void @memset_10_volatile(ptr %dst, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memset_10_volatile:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #10
+; GISel-MOPS-O3-NEXT:    mov w8, #10 // =0xa
 ; GISel-MOPS-O3-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; GISel-MOPS-O3-NEXT:    setp [x0]!, x8!, x1
 ; GISel-MOPS-O3-NEXT:    setm [x0]!, x8!, x1
@@ -532,7 +532,7 @@ define void @memset_10_volatile(ptr %dst, i32 %value) {
 ; SDAG-WITHOUT-MOPS-O2-LABEL: memset_10_volatile:
 ; SDAG-WITHOUT-MOPS-O2:       // %bb.0: // %entry
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    // kill: def $w1 killed $w1 def $x1
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov x8, #72340172838076673
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    and x9, x1, #0xff
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    mul x8, x9, x8
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str x8, [x0]
@@ -542,7 +542,7 @@ define void @memset_10_volatile(ptr %dst, i32 %value) {
 ; SDAG-MOPS-O2-LABEL: memset_10_volatile:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
 ; SDAG-MOPS-O2-NEXT:    // kill: def $w1 killed $w1 def $x1
-; SDAG-MOPS-O2-NEXT:    mov x8, #72340172838076673
+; SDAG-MOPS-O2-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
 ; SDAG-MOPS-O2-NEXT:    and x9, x1, #0xff
 ; SDAG-MOPS-O2-NEXT:    mul x8, x9, x8
 ; SDAG-MOPS-O2-NEXT:    str x8, [x0]
@@ -560,7 +560,7 @@ define void @memset_10000(ptr %dst, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10000
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memset
 ; GISel-WITHOUT-MOPS-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -571,14 +571,14 @@ define void @memset_10000(ptr %dst, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10000
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10000 // =0x2710
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memset
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memset_10000:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #10000
+; GISel-MOPS-O0-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    // implicit-def: $x9
 ; GISel-MOPS-O0-NEXT:    mov w9, w1
@@ -589,7 +589,7 @@ define void @memset_10000(ptr %dst, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memset_10000:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #10000
+; GISel-MOPS-O3-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-MOPS-O3-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; GISel-MOPS-O3-NEXT:    setp [x0]!, x8!, x1
 ; GISel-MOPS-O3-NEXT:    setm [x0]!, x8!, x1
@@ -601,14 +601,14 @@ define void @memset_10000(ptr %dst, i32 %value) {
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_def_cfa_offset 16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_offset w30, -16
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #10000
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #10000 // =0x2710
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    bl memset
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memset_10000:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #10000
+; SDAG-MOPS-O2-NEXT:    mov w8, #10000 // =0x2710
 ; SDAG-MOPS-O2-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; SDAG-MOPS-O2-NEXT:    setp [x0]!, x8!, x1
 ; SDAG-MOPS-O2-NEXT:    setm [x0]!, x8!, x1
@@ -626,7 +626,7 @@ define void @memset_10000_volatile(ptr %dst, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10000
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memset
 ; GISel-WITHOUT-MOPS-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -637,14 +637,14 @@ define void @memset_10000_volatile(ptr %dst, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10000
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10000 // =0x2710
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memset
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memset_10000_volatile:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #10000
+; GISel-MOPS-O0-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    // implicit-def: $x9
 ; GISel-MOPS-O0-NEXT:    mov w9, w1
@@ -655,7 +655,7 @@ define void @memset_10000_volatile(ptr %dst, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memset_10000_volatile:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #10000
+; GISel-MOPS-O3-NEXT:    mov w8, #10000 // =0x2710
 ; GISel-MOPS-O3-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; GISel-MOPS-O3-NEXT:    setp [x0]!, x8!, x1
 ; GISel-MOPS-O3-NEXT:    setm [x0]!, x8!, x1
@@ -667,14 +667,14 @@ define void @memset_10000_volatile(ptr %dst, i32 %value) {
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_def_cfa_offset 16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_offset w30, -16
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #10000
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #10000 // =0x2710
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    bl memset
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memset_10000_volatile:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #10000
+; SDAG-MOPS-O2-NEXT:    mov w8, #10000 // =0x2710
 ; SDAG-MOPS-O2-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; SDAG-MOPS-O2-NEXT:    setp [x0]!, x8!, x1
 ; SDAG-MOPS-O2-NEXT:    setm [x0]!, x8!, x1
@@ -910,7 +910,7 @@ define void @memcpy_10_volatile(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10 // =0xa
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memcpy
 ; GISel-WITHOUT-MOPS-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -921,14 +921,14 @@ define void @memcpy_10_volatile(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10 // =0xa
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memcpy
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memcpy_10_volatile:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #10
+; GISel-MOPS-O0-NEXT:    mov w8, #10 // =0xa
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O0-NEXT:    cpyfm [x0]!, [x1]!, x8!
@@ -937,7 +937,7 @@ define void @memcpy_10_volatile(ptr %dst, ptr %src, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memcpy_10_volatile:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #10
+; GISel-MOPS-O3-NEXT:    mov w8, #10 // =0xa
 ; GISel-MOPS-O3-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpyfm [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpyfe [x0]!, [x1]!, x8!
@@ -969,7 +969,7 @@ define void @memcpy_1000(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #1000
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memcpy
 ; GISel-WITHOUT-MOPS-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -980,14 +980,14 @@ define void @memcpy_1000(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #1000
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #1000 // =0x3e8
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memcpy
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memcpy_1000:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #1000
+; GISel-MOPS-O0-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O0-NEXT:    cpyfm [x0]!, [x1]!, x8!
@@ -996,7 +996,7 @@ define void @memcpy_1000(ptr %dst, ptr %src, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memcpy_1000:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #1000
+; GISel-MOPS-O3-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-MOPS-O3-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpyfm [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpyfe [x0]!, [x1]!, x8!
@@ -1007,14 +1007,14 @@ define void @memcpy_1000(ptr %dst, ptr %src, i32 %value) {
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_def_cfa_offset 16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_offset w30, -16
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #1000
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #1000 // =0x3e8
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    bl memcpy
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memcpy_1000:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #1000
+; SDAG-MOPS-O2-NEXT:    mov w8, #1000 // =0x3e8
 ; SDAG-MOPS-O2-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpyfm [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpyfe [x0]!, [x1]!, x8!
@@ -1030,7 +1030,7 @@ define void @memcpy_1000_volatile(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #1000
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memcpy
 ; GISel-WITHOUT-MOPS-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1041,14 +1041,14 @@ define void @memcpy_1000_volatile(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #1000
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #1000 // =0x3e8
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memcpy
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memcpy_1000_volatile:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #1000
+; GISel-MOPS-O0-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O0-NEXT:    cpyfm [x0]!, [x1]!, x8!
@@ -1057,7 +1057,7 @@ define void @memcpy_1000_volatile(ptr %dst, ptr %src, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memcpy_1000_volatile:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #1000
+; GISel-MOPS-O3-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-MOPS-O3-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpyfm [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpyfe [x0]!, [x1]!, x8!
@@ -1068,14 +1068,14 @@ define void @memcpy_1000_volatile(ptr %dst, ptr %src, i32 %value) {
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_def_cfa_offset 16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_offset w30, -16
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #1000
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #1000 // =0x3e8
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    bl memcpy
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memcpy_1000_volatile:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #1000
+; SDAG-MOPS-O2-NEXT:    mov w8, #1000 // =0x3e8
 ; SDAG-MOPS-O2-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpyfm [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpyfe [x0]!, [x1]!, x8!
@@ -1461,29 +1461,29 @@ define void @memcpy_inline_300(ptr %dst, ptr %src, i32 %value) {
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q1, q0, [x0, #16]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q1, q0, [x1, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q3, q2, [x1, #48]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q2, q3, [x1, #48]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q1, q0, [x0, #80]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q3, q2, [x0, #48]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q2, q3, [x0, #48]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q1, q0, [x1, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q3, q2, [x1, #112]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q2, q3, [x1, #112]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q1, q0, [x0, #144]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q3, q2, [x0, #112]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q2, q3, [x0, #112]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q1, q0, [x1, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q3, q2, [x1, #176]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q2, q3, [x1, #176]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q1, q0, [x0, #208]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q3, q2, [x0, #176]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q2, q1, [x1, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q2, q3, [x0, #176]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldp q3, q1, [x1, #256]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q0, [x8]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q2, [x1, #240]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    add x8, x0, #284
-; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #240]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x8]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q2, q1, [x0, #256]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q3, q1, [x0, #256]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    str q2, [x0, #240]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memcpy_inline_300:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #300
+; SDAG-MOPS-O2-NEXT:    mov w8, #300 // =0x12c
 ; SDAG-MOPS-O2-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpyfm [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpyfe [x0]!, [x1]!, x8!
@@ -1628,7 +1628,7 @@ define void @memcpy_inline_300_volatile(ptr %dst, ptr %src, i32 %value) {
 ;
 ; SDAG-MOPS-O2-LABEL: memcpy_inline_300_volatile:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #300
+; SDAG-MOPS-O2-NEXT:    mov w8, #300 // =0x12c
 ; SDAG-MOPS-O2-NEXT:    cpyfp [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpyfm [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpyfe [x0]!, [x1]!, x8!
@@ -1739,7 +1739,7 @@ define void @memmove_10_volatile(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #10 // =0xa
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memmove
 ; GISel-WITHOUT-MOPS-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1750,14 +1750,14 @@ define void @memmove_10_volatile(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #10 // =0xa
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memmove
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memmove_10_volatile:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #10
+; GISel-MOPS-O0-NEXT:    mov w8, #10 // =0xa
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    cpyp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O0-NEXT:    cpym [x0]!, [x1]!, x8!
@@ -1766,7 +1766,7 @@ define void @memmove_10_volatile(ptr %dst, ptr %src, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memmove_10_volatile:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #10
+; GISel-MOPS-O3-NEXT:    mov w8, #10 // =0xa
 ; GISel-MOPS-O3-NEXT:    cpyp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpym [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpye [x0]!, [x1]!, x8!
@@ -1798,7 +1798,7 @@ define void @memmove_1000(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #1000
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memmove
 ; GISel-WITHOUT-MOPS-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1809,14 +1809,14 @@ define void @memmove_1000(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #1000
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #1000 // =0x3e8
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memmove
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memmove_1000:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #1000
+; GISel-MOPS-O0-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    cpyp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O0-NEXT:    cpym [x0]!, [x1]!, x8!
@@ -1825,7 +1825,7 @@ define void @memmove_1000(ptr %dst, ptr %src, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memmove_1000:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #1000
+; GISel-MOPS-O3-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-MOPS-O3-NEXT:    cpyp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpym [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpye [x0]!, [x1]!, x8!
@@ -1836,14 +1836,14 @@ define void @memmove_1000(ptr %dst, ptr %src, i32 %value) {
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_def_cfa_offset 16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_offset w30, -16
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #1000
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #1000 // =0x3e8
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    bl memmove
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memmove_1000:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #1000
+; SDAG-MOPS-O2-NEXT:    mov w8, #1000 // =0x3e8
 ; SDAG-MOPS-O2-NEXT:    cpyp [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpym [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpye [x0]!, [x1]!, x8!
@@ -1859,7 +1859,7 @@ define void @memmove_1000_volatile(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O0-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #1000
+; GISel-WITHOUT-MOPS-O0-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    mov w2, w8
 ; GISel-WITHOUT-MOPS-O0-NEXT:    bl memmove
 ; GISel-WITHOUT-MOPS-O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -1870,14 +1870,14 @@ define void @memmove_1000_volatile(ptr %dst, ptr %src, i32 %value) {
 ; GISel-WITHOUT-MOPS-O3-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_def_cfa_offset 16
 ; GISel-WITHOUT-MOPS-O3-NEXT:    .cfi_offset w30, -16
-; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #1000
+; GISel-WITHOUT-MOPS-O3-NEXT:    mov w2, #1000 // =0x3e8
 ; GISel-WITHOUT-MOPS-O3-NEXT:    bl memmove
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISel-WITHOUT-MOPS-O3-NEXT:    ret
 ;
 ; GISel-MOPS-O0-LABEL: memmove_1000_volatile:
 ; GISel-MOPS-O0:       // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT:    mov w8, #1000
+; GISel-MOPS-O0-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-MOPS-O0-NEXT:    // kill: def $x8 killed $w8
 ; GISel-MOPS-O0-NEXT:    cpyp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O0-NEXT:    cpym [x0]!, [x1]!, x8!
@@ -1886,7 +1886,7 @@ define void @memmove_1000_volatile(ptr %dst, ptr %src, i32 %value) {
 ;
 ; GISel-MOPS-O3-LABEL: memmove_1000_volatile:
 ; GISel-MOPS-O3:       // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT:    mov w8, #1000
+; GISel-MOPS-O3-NEXT:    mov w8, #1000 // =0x3e8
 ; GISel-MOPS-O3-NEXT:    cpyp [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpym [x0]!, [x1]!, x8!
 ; GISel-MOPS-O3-NEXT:    cpye [x0]!, [x1]!, x8!
@@ -1897,14 +1897,14 @@ define void @memmove_1000_volatile(ptr %dst, ptr %src, i32 %value) {
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_def_cfa_offset 16
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    .cfi_offset w30, -16
-; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #1000
+; SDAG-WITHOUT-MOPS-O2-NEXT:    mov w2, #1000 // =0x3e8
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    bl memmove
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memmove_1000_volatile:
 ; SDAG-MOPS-O2:       // %bb.0: // %entry
-; SDAG-MOPS-O2-NEXT:    mov w8, #1000
+; SDAG-MOPS-O2-NEXT:    mov w8, #1000 // =0x3e8
 ; SDAG-MOPS-O2-NEXT:    cpyp [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpym [x0]!, [x1]!, x8!
 ; SDAG-MOPS-O2-NEXT:    cpye [x0]!, [x1]!, x8!

diff --git a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
index e9baefc7509873..e41eb7d38c370d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mull-masks.ll
@@ -80,8 +80,8 @@ entry:
 define i64 @smull_ldrsb_b(ptr %x0, i8 %x1) {
 ; CHECK-LABEL: smull_ldrsb_b:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smull x0, w8, w9
 ; CHECK-NEXT:    ret
@@ -96,8 +96,8 @@ entry:
 define i64 @smull_ldrsb_b_commuted(ptr %x0, i8 %x1) {
 ; CHECK-LABEL: smull_ldrsb_b_commuted:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smull x0, w9, w8
 ; CHECK-NEXT:    ret
@@ -112,8 +112,8 @@ entry:
 define i64 @smull_ldrsb_h(ptr %x0, i16 %x1) {
 ; CHECK-LABEL: smull_ldrsb_h:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smull x0, w8, w9
 ; CHECK-NEXT:    ret
@@ -142,8 +142,8 @@ entry:
 define i64 @smull_ldrsh_b(ptr %x0, i8 %x1) {
 ; CHECK-LABEL: smull_ldrsh_b:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsh x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsh x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smull x0, w8, w9
 ; CHECK-NEXT:    ret
@@ -158,8 +158,8 @@ entry:
 define i64 @smull_ldrsh_h(ptr %x0, i16 %x1) {
 ; CHECK-LABEL: smull_ldrsh_h:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsh x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsh x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smull x0, w8, w9
 ; CHECK-NEXT:    ret
@@ -174,8 +174,8 @@ entry:
 define i64 @smull_ldrsh_h_commuted(ptr %x0, i16 %x1) {
 ; CHECK-LABEL: smull_ldrsh_h_commuted:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsh x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsh x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smull x0, w9, w8
 ; CHECK-NEXT:    ret
@@ -204,8 +204,8 @@ entry:
 define i64 @smull_ldrsw_b(ptr %x0, i8 %x1) {
 ; CHECK-LABEL: smull_ldrsw_b:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smull x0, w8, w9
 ; CHECK-NEXT:    ret
@@ -220,8 +220,8 @@ entry:
 define i64 @smull_ldrsw_h(ptr %x0, i16 %x1) {
 ; CHECK-LABEL: smull_ldrsw_h:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smull x0, w8, w9
 ; CHECK-NEXT:    ret
@@ -359,8 +359,8 @@ entry:
 define i64 @smaddl_ldrsb_h(ptr %x0, i16 %x1, i64 %x2) {
 ; CHECK-LABEL: smaddl_ldrsb_h:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smaddl x0, w8, w9, x2
 ; CHECK-NEXT:    ret
@@ -376,8 +376,8 @@ entry:
 define i64 @smaddl_ldrsb_h_commuted(ptr %x0, i16 %x1, i64 %x2) {
 ; CHECK-LABEL: smaddl_ldrsb_h_commuted:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smaddl x0, w9, w8, x2
 ; CHECK-NEXT:    ret
@@ -423,8 +423,8 @@ entry:
 define i64 @smaddl_ldrsw_b(ptr %x0, i8 %x1, i64 %x2) {
 ; CHECK-LABEL: smaddl_ldrsw_b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smaddl x0, w8, w9, x2
 ; CHECK-NEXT:    ret
@@ -439,8 +439,8 @@ define i64 @smaddl_ldrsw_b(ptr %x0, i8 %x1, i64 %x2) {
 define i64 @smaddl_ldrsw_b_commuted(ptr %x0, i8 %x1, i64 %x2) {
 ; CHECK-LABEL: smaddl_ldrsw_b_commuted:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smaddl x0, w9, w8, x2
 ; CHECK-NEXT:    ret
@@ -523,8 +523,8 @@ entry:
 define i64 @smnegl_ldrsb_h(ptr %x0, i16 %x1) {
 ; CHECK-LABEL: smnegl_ldrsb_h:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smnegl x0, w8, w9
 ; CHECK-NEXT:    ret
@@ -540,8 +540,8 @@ entry:
 define i64 @smnegl_ldrsb_h_commuted(ptr %x0, i16 %x1) {
 ; CHECK-LABEL: smnegl_ldrsb_h_commuted:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smnegl x0, w9, w8
 ; CHECK-NEXT:    ret
@@ -587,8 +587,8 @@ entry:
 define i64 @smnegl_ldrsw_b(ptr %x0, i8 %x1) {
 ; CHECK-LABEL: smnegl_ldrsw_b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smnegl x0, w8, w9
 ; CHECK-NEXT:    ret
@@ -603,8 +603,8 @@ define i64 @smnegl_ldrsw_b(ptr %x0, i8 %x1) {
 define i64 @smnegl_ldrsw_b_commuted(ptr %x0, i8 %x1) {
 ; CHECK-LABEL: smnegl_ldrsw_b_commuted:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smnegl x0, w9, w8
 ; CHECK-NEXT:    ret
@@ -687,8 +687,8 @@ entry:
 define i64 @smsubl_ldrsb_h(ptr %x0, i16 %x1, i64 %x2) {
 ; CHECK-LABEL: smsubl_ldrsb_h:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smsubl x0, w8, w9, x2
 ; CHECK-NEXT:    ret
@@ -704,8 +704,8 @@ entry:
 define i64 @smsubl_ldrsb_h_commuted(ptr %x0, i16 %x1, i64 %x2) {
 ; CHECK-LABEL: smsubl_ldrsb_h_commuted:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsb x8, [x0]
 ; CHECK-NEXT:    sxth x9, w1
 ; CHECK-NEXT:    smsubl x0, w9, w8, x2
 ; CHECK-NEXT:    ret
@@ -751,8 +751,8 @@ entry:
 define i64 @smsubl_ldrsw_b(ptr %x0, i8 %x1, i64 %x2) {
 ; CHECK-LABEL: smsubl_ldrsw_b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smsubl x0, w8, w9, x2
 ; CHECK-NEXT:    ret
@@ -767,8 +767,8 @@ define i64 @smsubl_ldrsw_b(ptr %x0, i8 %x1, i64 %x2) {
 define i64 @smsubl_ldrsw_b_commuted(ptr %x0, i8 %x1, i64 %x2) {
 ; CHECK-LABEL: smsubl_ldrsw_b_commuted:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldrsw x8, [x0]
 ; CHECK-NEXT:    sxtb x9, w1
 ; CHECK-NEXT:    smsubl x0, w9, w8, x2
 ; CHECK-NEXT:    ret
@@ -1372,10 +1372,10 @@ entry:
 define i64 @umull_ldr2_w_cc2(ptr %x0, i32 %x1) {
 ; CHECK-LABEL: umull_ldr2_w_cc2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w8, w1
-; CHECK-NEXT:    and x9, x9, #0x1ffffffff
-; CHECK-NEXT:    mul x0, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mov w9, w1
+; CHECK-NEXT:    and x8, x8, #0x1ffffffff
+; CHECK-NEXT:    mul x0, x8, x9
 ; CHECK-NEXT:    ret
 entry:
   %ext64 = load i64, ptr %x0

diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 5d4df94807d063..ee349b2db4a941 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -48,12 +48,12 @@ define void @insert_vec_v16i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-LABEL: insert_vec_v16i32_uaddlv_from_v8i16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    movi.2d v1, #0000000000000000
-; CHECK-NEXT:    uaddlv.8h s2, v0
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    uaddlv.8h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
-; CHECK-NEXT:    mov.s v1[0], v2[0]
-; CHECK-NEXT:    ucvtf.4s v1, v1
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    mov.s v2[0], v1[0]
+; CHECK-NEXT:    ucvtf.4s v2, v2
+; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -68,16 +68,16 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-LABEL: insert_vec_v23i32_uaddlv_from_v8i16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
 ; CHECK-NEXT:    add x8, x0, #88
-; CHECK-NEXT:    movi.2d v1, #0000000000000000
-; CHECK-NEXT:    uaddlv.8h s2, v0
+; CHECK-NEXT:    uaddlv.8h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #16]
 ; CHECK-NEXT:    stp q0, q0, [x0, #48]
 ; CHECK-NEXT:    st1.s { v0 }[2], [x8]
-; CHECK-NEXT:    mov.s v1[0], v2[0]
 ; CHECK-NEXT:    str d0, [x0, #80]
-; CHECK-NEXT:    ucvtf.4s v1, v1
-; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    mov.s v2[0], v1[0]
+; CHECK-NEXT:    ucvtf.4s v2, v2
+; CHECK-NEXT:    str q2, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -207,8 +207,8 @@ define void @insert_vec_v8i16_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-LABEL: insert_vec_v8i16_uaddlv_from_v8i16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
+; CHECK-NEXT:    stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT:    uaddlv.8h s0, v0
 ; CHECK-NEXT:    mov.h v1[0], v0[0]
 ; CHECK-NEXT:    ushll.4s v1, v1, #0
@@ -229,8 +229,8 @@ define void @insert_vec_v3i16_uaddlv_from_v8i16(ptr %0) {
 ; CHECK-LABEL: insert_vec_v3i16_uaddlv_from_v8i16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    add x8, x0, #8
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
+; CHECK-NEXT:    add x8, x0, #8
 ; CHECK-NEXT:    uaddlv.8h s0, v0
 ; CHECK-NEXT:    mov.h v1[0], v0[0]
 ; CHECK-NEXT:    ushll.4s v1, v1, #0
@@ -252,13 +252,13 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
 ; CHECK-LABEL: insert_vec_v16i64_uaddlv_from_v4i16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    movi.2d v1, #0000000000000000
-; CHECK-NEXT:    uaddlv.4h s2, v0
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    uaddlv.4h s1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
-; CHECK-NEXT:    mov.s v1[0], v2[0]
-; CHECK-NEXT:    ucvtf.2d v1, v1
-; CHECK-NEXT:    fcvtn v1.2s, v1.2d
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    mov.s v2[0], v1[0]
+; CHECK-NEXT:    ucvtf.2d v2, v2
+; CHECK-NEXT:    fcvtn v2.2s, v2.2d
+; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -274,14 +274,14 @@ define void @insert_vec_v16i8_uaddlv_from_v8i8(ptr %0) {
 ; CHECK-LABEL: insert_vec_v16i8_uaddlv_from_v8i8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    movi.2d v1, #0000000000000000
-; CHECK-NEXT:    uaddlv.8b h2, v0
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    uaddlv.8b h1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
-; CHECK-NEXT:    mov.h v1[0], v2[0]
-; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    ushll.4s v1, v1, #0
-; CHECK-NEXT:    ucvtf.4s v1, v1
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    mov.h v2[0], v1[0]
+; CHECK-NEXT:    bic.4h v2, #255, lsl #8
+; CHECK-NEXT:    ushll.4s v2, v2, #0
+; CHECK-NEXT:    ucvtf.4s v2, v2
+; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
 
 entry:
@@ -361,11 +361,11 @@ define void @insert_vec_v16i32_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-LABEL: insert_vec_v16i32_uaddlv_from_v4i32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    movi.2d v1, #0000000000000000
-; CHECK-NEXT:    uaddlv.4s d2, v0
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    uaddlv.4s d1, v0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
-; CHECK-NEXT:    mov.s v1[0], v2[0]
-; CHECK-NEXT:    ucvtf.4s v1, v1
+; CHECK-NEXT:    mov.s v2[0], v1[0]
+; CHECK-NEXT:    ucvtf.4s v1, v2
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
 
@@ -426,8 +426,8 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-LABEL: insert_vec_v8i8_uaddlv_from_v4i32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT:    movi.2d v1, #0000000000000000
+; CHECK-NEXT:    stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT:    uaddlv.4s d0, v0
 ; CHECK-NEXT:    mov.h v1[0], v0[0]
 ; CHECK-NEXT:    bic.4h v1, #255, lsl #8
@@ -454,8 +454,8 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
 ; CHECK-NEXT:    mov.h v1[0], v0[0]
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
 ; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    ushll.4s v1, v1, #0
 ; CHECK-NEXT:    stp q0, q0, [x0, #32]
+; CHECK-NEXT:    ushll.4s v1, v1, #0
 ; CHECK-NEXT:    ucvtf.4s v1, v1
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
index a1997f2b774cd4..9d7aa78ec139f6 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll
@@ -8,19 +8,19 @@
 define void @test1(ptr %0, ptr %1) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #61186
-; CHECK-NEXT:    mov w8, #56824
-; CHECK-NEXT:    movk w9, #29710, lsl #16
+; CHECK-NEXT:    mov w8, #56824 // =0xddf8
+; CHECK-NEXT:    mov w9, #61186 // =0xef02
 ; CHECK-NEXT:    movk w8, #40522, lsl #16
+; CHECK-NEXT:    movk w9, #29710, lsl #16
 ; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    fmov d3, x9
 ; CHECK-NEXT:    dup v2.2d, x8
-; CHECK-NEXT:    pmull2 v4.1q, v0.2d, v2.2d
-; CHECK-NEXT:    pmull v0.1q, v0.1d, v3.1d
-; CHECK-NEXT:    pmull2 v2.1q, v1.2d, v2.2d
-; CHECK-NEXT:    pmull v1.1q, v1.1d, v3.1d
-; CHECK-NEXT:    eor v0.16b, v0.16b, v4.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    fmov d3, x9
+; CHECK-NEXT:    pmull v4.1q, v0.1d, v3.1d
+; CHECK-NEXT:    pmull v3.1q, v1.1d, v3.1d
+; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v2.2d
+; CHECK-NEXT:    pmull2 v1.1q, v1.2d, v2.2d
+; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
 ; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
   %3 = load <2 x i64>, ptr %1

diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index cf2bc9c2e58968..a490556f1a97ef 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -87,12 +87,12 @@ define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    orr v0.8h, #128, lsl #8
-; CHECK-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
 ; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    mul v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mul v0.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    ret
   %load.A = load <8 x i16>, ptr %A
   %or.A = or <8 x i16> %load.A, <i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000>
@@ -123,26 +123,26 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
 ; CHECK-NEON:       // %bb.0:
 ; CHECK-NEON-NEXT:    ldr d0, [x1]
-; CHECK-NEON-NEXT:    ldrh w8, [x0]
-; CHECK-NEON-NEXT:    ldrh w11, [x0, #2]
+; CHECK-NEON-NEXT:    ldrh w9, [x0]
+; CHECK-NEON-NEXT:    ldrh w10, [x0, #2]
 ; CHECK-NEON-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-NEON-NEXT:    fmov x9, d0
-; CHECK-NEON-NEXT:    mov x10, v0.d[1]
-; CHECK-NEON-NEXT:    smull x8, w8, w9
-; CHECK-NEON-NEXT:    smull x9, w11, w10
-; CHECK-NEON-NEXT:    fmov d0, x8
-; CHECK-NEON-NEXT:    mov v0.d[1], x9
+; CHECK-NEON-NEXT:    fmov x11, d0
+; CHECK-NEON-NEXT:    mov x8, v0.d[1]
+; CHECK-NEON-NEXT:    smull x9, w9, w11
+; CHECK-NEON-NEXT:    smull x8, w10, w8
+; CHECK-NEON-NEXT:    fmov d0, x9
+; CHECK-NEON-NEXT:    mov v0.d[1], x8
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
 ; CHECK-SVE:       // %bb.0:
 ; CHECK-SVE-NEXT:    ldrh w8, [x0]
 ; CHECK-SVE-NEXT:    ptrue p0.d, vl2
-; CHECK-SVE-NEXT:    ldr d1, [x1]
-; CHECK-SVE-NEXT:    fmov d0, x8
-; CHECK-SVE-NEXT:    ldrh w8, [x0, #2]
-; CHECK-SVE-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-SVE-NEXT:    mov v0.d[1], x8
+; CHECK-SVE-NEXT:    ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT:    ldr d0, [x1]
+; CHECK-SVE-NEXT:    fmov d1, x8
+; CHECK-SVE-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-SVE-NEXT:    mov v1.d[1], x9
 ; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-SVE-NEXT:    ret
@@ -272,8 +272,8 @@ define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 define <8 x i16> @smlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlal_v8i8_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
@@ -290,8 +290,8 @@ define <8 x i16> @smlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x i32> @smlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlal_v4i16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
@@ -308,8 +308,8 @@ define <4 x i32> @smlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @smlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlal_v2i32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    ret
@@ -326,8 +326,8 @@ define <2 x i64> @smlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 define <8 x i16> @umlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlal_v8i8_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    umlal v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
@@ -344,8 +344,8 @@ define <8 x i16> @umlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x i32> @umlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlal_v4i16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
@@ -362,8 +362,8 @@ define <4 x i32> @umlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlal_v2i32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    ret
@@ -380,8 +380,8 @@ define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: amlal_v8i8_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT:    bic v0.8h, #255, lsl #8
@@ -400,12 +400,12 @@ define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: amlal_v4i16_v4i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
-; CHECK-NEXT:    ldr d3, [x2]
-; CHECK-NEXT:    movi v0.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    smlal v2.4s, v1.4h, v3.4h
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    ldr d2, [x2]
+; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
+; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -421,12 +421,12 @@ define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: amlal_v2i32_v2i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
-; CHECK-NEXT:    ldr d3, [x2]
-; CHECK-NEXT:    movi v0.2d, #0x000000ffffffff
-; CHECK-NEXT:    smlal v2.2d, v1.2s, v3.2s
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    ldr d2, [x2]
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -442,8 +442,8 @@ define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 define <8 x i16> @smlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlsl_v8i8_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
@@ -460,8 +460,8 @@ define <8 x i16> @smlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x i32> @smlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlsl_v4i16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
@@ -478,8 +478,8 @@ define <4 x i32> @smlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @smlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlsl_v2i32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    ret
@@ -496,8 +496,8 @@ define <2 x i64> @smlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 define <8 x i16> @umlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlsl_v8i8_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    umlsl v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
@@ -514,8 +514,8 @@ define <8 x i16> @umlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x i32> @umlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlsl_v4i16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
@@ -532,8 +532,8 @@ define <4 x i32> @umlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlsl_v2i32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    ret
@@ -550,8 +550,8 @@ define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: amlsl_v8i8_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
 ; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT:    bic v0.8h, #255, lsl #8
@@ -570,12 +570,12 @@ define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: amlsl_v4i16_v4i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
-; CHECK-NEXT:    ldr d3, [x2]
-; CHECK-NEXT:    movi v0.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    smlsl v2.4s, v1.4h, v3.4h
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    ldr d2, [x2]
+; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
+; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -591,12 +591,12 @@ define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: amlsl_v2i32_v2i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
-; CHECK-NEXT:    ldr d3, [x2]
-; CHECK-NEXT:    movi v0.2d, #0x000000ffffffff
-; CHECK-NEXT:    smlsl v2.2d, v1.2s, v3.2s
-; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    ldr d2, [x2]
+; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
+; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -724,9 +724,9 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
 ; CHECK-LABEL: amull_extvec_v4i16_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-NEXT:    dup v1.4h, w8
+; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    dup v2.4h, w8
-; CHECK-NEXT:    smull v0.4s, v0.4h, v2.4h
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp3 = zext <4 x i16> %arg to <4 x i32>
@@ -739,9 +739,9 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-LABEL: amull_extvec_v2i32_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-NEXT:    dup v1.2s, w8
+; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT:    dup v2.2s, w8
-; CHECK-NEXT:    smull v0.2d, v0.2s, v2.2s
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %tmp3 = zext <2 x i32> %arg to <2 x i64>
@@ -897,11 +897,11 @@ define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
 define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
 ; CHECK-LABEL: amull2_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    smull v2.8h, v0.8b, v1.8b
+; CHECK-NEXT:    smull2 v1.8h, v0.16b, v1.16b
 ; CHECK-NEXT:    bic v2.8h, #255, lsl #8
-; CHECK-NEXT:    bic v0.8h, #255, lsl #8
-; CHECK-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
   %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
@@ -914,10 +914,10 @@ define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
 ; CHECK-LABEL: amull2_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    smull2 v3.4s, v0.8h, v1.8h
-; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    and v1.16b, v3.16b, v2.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    smull v3.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT:    and v1.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
   %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
@@ -930,10 +930,10 @@ define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
 ; CHECK-LABEL: amull2_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.2d, #0x000000ffffffff
-; CHECK-NEXT:    smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    and v1.16b, v3.16b, v2.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    smull v3.2d, v0.2s, v1.2s
+; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.4s
+; CHECK-NEXT:    and v1.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v0.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    ret
   %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
   %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>

diff --git a/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll b/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll
index f96bdc1ed63cd1..b2cb38c72bae83 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll
@@ -13,10 +13,10 @@ define <4 x i16> @test_combine_v4i16_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    xtn v1.2s, v1.2d
 ; CHECK-BE-NEXT:    xtn v0.2s, v0.2d
-; CHECK-BE-NEXT:    rev32 v1.4h, v1.4h
+; CHECK-BE-NEXT:    xtn v1.2s, v1.2d
 ; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT:    rev32 v1.4h, v1.4h
 ; CHECK-BE-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
 ; CHECK-BE-NEXT:    rev64 v0.4h, v0.4h
 ; CHECK-BE-NEXT:    ret
@@ -43,8 +43,8 @@ define <4 x i16> @test_combine_v4i16_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    xtn v1.4h, v1.4s
 ; CHECK-BE-NEXT:    xtn v0.4h, v0.4s
+; CHECK-BE-NEXT:    xtn v1.4h, v1.4s
 ; CHECK-BE-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
 ; CHECK-BE-NEXT:    rev64 v0.4h, v0.4h
 ; CHECK-BE-NEXT:    ret
@@ -68,10 +68,10 @@ define <4 x i16> @test_combine_v4i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-BE-NEXT:    xtn v0.8b, v0.8h
-; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
+; CHECK-BE-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-BE-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
 ; CHECK-BE-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
 ; CHECK-BE-NEXT:    rev64 v0.4h, v0.4h
 ; CHECK-BE-NEXT:    ret
@@ -97,10 +97,10 @@ define <8 x i8> @test_combine_v8i8_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    xtn v1.2s, v1.2d
 ; CHECK-BE-NEXT:    xtn v0.2s, v0.2d
-; CHECK-BE-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-BE-NEXT:    xtn v1.2s, v1.2d
 ; CHECK-BE-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-BE-NEXT:    rev32 v1.8b, v1.8b
 ; CHECK-BE-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
 ; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    ret
@@ -127,10 +127,10 @@ define <8 x i8> @test_combine_v8i8_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    xtn v1.4h, v1.4s
 ; CHECK-BE-NEXT:    xtn v0.4h, v0.4s
-; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
+; CHECK-BE-NEXT:    xtn v1.4h, v1.4s
 ; CHECK-BE-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
 ; CHECK-BE-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
 ; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    ret
@@ -157,8 +157,8 @@ define <8 x i8> @test_combine_v8i8_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-BE-NEXT:    xtn v0.8b, v0.8h
+; CHECK-BE-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-BE-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
 ; CHECK-BE-NEXT:    rev64 v0.8b, v0.8b
 ; CHECK-BE-NEXT:    ret
@@ -181,8 +181,8 @@ define <2 x i32> @test_combine_v2i32_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    xtn v1.2s, v1.2d
 ; CHECK-BE-NEXT:    xtn v0.2s, v0.2d
+; CHECK-BE-NEXT:    xtn v1.2s, v1.2d
 ; CHECK-BE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
 ; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
 ; CHECK-BE-NEXT:    ret
@@ -207,10 +207,10 @@ define <2 x i32> @test_combine_v2i32_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    xtn v1.4h, v1.4s
 ; CHECK-BE-NEXT:    xtn v0.4h, v0.4s
-; CHECK-BE-NEXT:    rev32 v1.4h, v1.4h
+; CHECK-BE-NEXT:    xtn v1.4h, v1.4s
 ; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT:    rev32 v1.4h, v1.4h
 ; CHECK-BE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
 ; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
 ; CHECK-BE-NEXT:    ret
@@ -238,10 +238,10 @@ define <2 x i32> @test_combine_v2i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-BE-NEXT:    xtn v0.8b, v0.8h
-; CHECK-BE-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-BE-NEXT:    xtn v1.8b, v1.8h
 ; CHECK-BE-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-BE-NEXT:    rev32 v1.8b, v1.8b
 ; CHECK-BE-NEXT:    zip1 v0.2s, v0.2s, v1.2s
 ; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
 ; CHECK-BE-NEXT:    ret
@@ -268,10 +268,10 @@ define i8 @trunc_v4i64_v4i8(<4 x i64> %input) {
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-BE-NEXT:    xtn v1.2s, v1.2d
-; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h
+; CHECK-BE-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-BE-NEXT:    rev32 v1.4h, v1.4h
+; CHECK-BE-NEXT:    rev32 v0.4h, v0.4h
 ; CHECK-BE-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
 ; CHECK-BE-NEXT:    addv h0, v0.4h
 ; CHECK-BE-NEXT:    fmov w0, s0

diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
index 1c52b359156f69..a5154641400309 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
@@ -21,13 +21,13 @@ define <16 x i32> @mul_i32(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: mul_i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v4.8h, v0.16b, #0
-; CHECK-NEXT:    ushll2 v5.8h, v1.16b, #0
-; CHECK-NEXT:    ushll v0.8h, v1.8b, #0
-; CHECK-NEXT:    umull2 v3.4s, v4.8h, v5.8h
-; CHECK-NEXT:    umull2 v1.4s, v2.8h, v0.8h
-; CHECK-NEXT:    umull v0.4s, v2.4h, v0.4h
-; CHECK-NEXT:    umull v2.4s, v4.4h, v5.4h
+; CHECK-NEXT:    ushll v4.8h, v1.8b, #0
+; CHECK-NEXT:    ushll2 v5.8h, v0.16b, #0
+; CHECK-NEXT:    ushll2 v6.8h, v1.16b, #0
+; CHECK-NEXT:    umull v0.4s, v2.4h, v4.4h
+; CHECK-NEXT:    umull2 v1.4s, v2.8h, v4.8h
+; CHECK-NEXT:    umull2 v3.4s, v5.8h, v6.8h
+; CHECK-NEXT:    umull v2.4s, v5.4h, v6.4h
 ; CHECK-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i32>
@@ -41,24 +41,24 @@ define <16 x i64> @mul_i64(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll v6.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v16.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v3.8h, v1.8b, #0
 ; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
 ; CHECK-NEXT:    ushll v5.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v17.4s, v1.4h, #0
+; CHECK-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NEXT:    ushll v16.4s, v1.4h, #0
+; CHECK-NEXT:    ushll2 v7.4s, v3.8h, #0
+; CHECK-NEXT:    ushll2 v17.4s, v0.8h, #0
 ; CHECK-NEXT:    ushll2 v18.4s, v1.8h, #0
-; CHECK-NEXT:    ushll2 v19.4s, v0.8h, #0
-; CHECK-NEXT:    umull2 v7.2d, v16.4s, v18.4s
-; CHECK-NEXT:    umull2 v3.2d, v2.4s, v19.4s
-; CHECK-NEXT:    umull2 v1.2d, v4.4s, v5.4s
-; CHECK-NEXT:    umull v0.2d, v4.2s, v5.2s
-; CHECK-NEXT:    umull2 v5.2d, v6.4s, v17.4s
-; CHECK-NEXT:    umull v2.2d, v2.2s, v19.2s
-; CHECK-NEXT:    umull v4.2d, v6.2s, v17.2s
-; CHECK-NEXT:    umull v6.2d, v16.2s, v18.2s
+; CHECK-NEXT:    umull2 v1.2d, v4.4s, v6.4s
+; CHECK-NEXT:    umull v0.2d, v4.2s, v6.2s
+; CHECK-NEXT:    umull2 v3.2d, v2.4s, v7.4s
+; CHECK-NEXT:    umull v2.2d, v2.2s, v7.2s
+; CHECK-NEXT:    umull v4.2d, v5.2s, v16.2s
+; CHECK-NEXT:    umull2 v7.2d, v17.4s, v18.4s
+; CHECK-NEXT:    umull2 v5.2d, v5.4s, v16.4s
+; CHECK-NEXT:    umull v6.2d, v17.2s, v18.2s
 ; CHECK-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i64>
@@ -73,8 +73,8 @@ define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    umlal2 v3.8h, v0.16b, v1.16b
 ; CHECK-NEXT:    umlal v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    mov v1.16b, v3.16b
 ; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    mov v1.16b, v3.16b
 ; CHECK-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i16>
@@ -88,15 +88,15 @@ define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) {
 ; CHECK-LABEL: mla_i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ushll v6.8h, v0.8b, #0
+; CHECK-NEXT:    ushll v7.8h, v1.8b, #0
 ; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ushll2 v7.8h, v1.16b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    umlal2 v5.4s, v0.8h, v7.8h
-; CHECK-NEXT:    umlal2 v3.4s, v6.8h, v1.8h
-; CHECK-NEXT:    umlal v2.4s, v6.4h, v1.4h
-; CHECK-NEXT:    umlal v4.4s, v0.4h, v7.4h
-; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-NEXT:    umlal v2.4s, v6.4h, v7.4h
+; CHECK-NEXT:    umlal2 v3.4s, v6.8h, v7.8h
+; CHECK-NEXT:    umlal2 v5.4s, v0.8h, v1.8h
+; CHECK-NEXT:    umlal v4.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    mov v1.16b, v3.16b
 ; CHECK-NEXT:    mov v2.16b, v4.16b
 ; CHECK-NEXT:    mov v3.16b, v5.16b
 ; CHECK-NEXT:    ret
@@ -113,25 +113,25 @@ define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov v17.16b, v7.16b
 ; CHECK-NEXT:    mov v16.16b, v6.16b
-; CHECK-NEXT:    ldp q6, q7, [sp]
-; CHECK-NEXT:    ushll v18.8h, v0.8b, #0
+; CHECK-NEXT:    ushll v6.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v21.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v7.8h, v1.8b, #0
 ; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT:    ushll v19.4s, v18.4h, #0
-; CHECK-NEXT:    ushll v20.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v18.4s, v18.8h, #0
-; CHECK-NEXT:    ushll v22.4s, v21.4h, #0
-; CHECK-NEXT:    ushll v23.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v21.4s, v21.8h, #0
+; CHECK-NEXT:    ushll v18.4s, v6.4h, #0
+; CHECK-NEXT:    ushll2 v21.4s, v6.8h, #0
+; CHECK-NEXT:    ushll v19.4s, v0.4h, #0
+; CHECK-NEXT:    ushll v20.4s, v7.4h, #0
+; CHECK-NEXT:    ushll v22.4s, v1.4h, #0
+; CHECK-NEXT:    ushll2 v23.4s, v7.8h, #0
+; CHECK-NEXT:    ldp q6, q7, [sp]
 ; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #0
 ; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    umlal2 v5.2d, v18.4s, v21.4s
-; CHECK-NEXT:    umlal2 v17.2d, v20.4s, v23.4s
-; CHECK-NEXT:    umlal2 v3.2d, v19.4s, v22.4s
-; CHECK-NEXT:    umlal v2.2d, v19.2s, v22.2s
-; CHECK-NEXT:    umlal v4.2d, v18.2s, v21.2s
-; CHECK-NEXT:    umlal v16.2d, v20.2s, v23.2s
+; CHECK-NEXT:    umlal2 v3.2d, v18.4s, v20.4s
+; CHECK-NEXT:    umlal v2.2d, v18.2s, v20.2s
+; CHECK-NEXT:    umlal v16.2d, v19.2s, v22.2s
+; CHECK-NEXT:    umlal2 v5.2d, v21.4s, v23.4s
+; CHECK-NEXT:    umlal v4.2d, v21.2s, v23.2s
+; CHECK-NEXT:    umlal2 v17.2d, v19.4s, v22.4s
 ; CHECK-NEXT:    umlal2 v7.2d, v0.4s, v1.4s
 ; CHECK-NEXT:    umlal v6.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    mov v0.16b, v2.16b

diff --git a/llvm/test/CodeGen/AArch64/aarch64_fnmadd.ll b/llvm/test/CodeGen/AArch64/aarch64_fnmadd.ll
index b47736907e1e2c..f7d9311a4154c8 100644
--- a/llvm/test/CodeGen/AArch64/aarch64_fnmadd.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64_fnmadd.ll
@@ -4,10 +4,10 @@
 define void @fnmaddd(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fnmaddd:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ldr d2, [x2]
-; CHECK-NEXT:    fnmadd d0, d0, d1, d2
+; CHECK-NEXT:    fnmadd d0, d1, d0, d2
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
 entry:
@@ -47,10 +47,10 @@ entry:
 define void @fnmadds(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fnmadds:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr s0, [x1]
-; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    ldr s2, [x2]
-; CHECK-NEXT:    fnmadd s0, s0, s1, s2
+; CHECK-NEXT:    fnmadd s0, s1, s0, s2
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
 entry:
@@ -67,10 +67,10 @@ entry:
 define void @fnmadds_nsz_contract(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fnmadds_nsz_contract:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr s0, [x1]
-; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    ldr s2, [x2]
-; CHECK-NEXT:    fnmadd s0, s0, s1, s2
+; CHECK-NEXT:    fnmadd s0, s1, s0, s2
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
 entry:
@@ -88,10 +88,10 @@ entry:
 define void @fnmadds_contract(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fnmadds_contract:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr s0, [x1]
-; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    ldr s2, [x2]
-; CHECK-NEXT:    fmadd s0, s0, s1, s2
+; CHECK-NEXT:    fmadd s0, s1, s0, s2
 ; CHECK-NEXT:    fneg s0, s0
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
index ab3802127ffbb1..7d488c9ca20022 100644
--- a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
@@ -9,8 +9,8 @@ define win64cc void @pass_va(i32 %count, ...) nounwind {
 ; CHECK-NEXT:    add x8, sp, #40
 ; CHECK-NEXT:    add x0, sp, #40
 ; CHECK-NEXT:    stp x30, x18, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x3, x4, [sp, #56]
 ; CHECK-NEXT:    stp x1, x2, [sp, #40]
+; CHECK-NEXT:    stp x3, x4, [sp, #56]
 ; CHECK-NEXT:    stp x5, x6, [sp, #72]
 ; CHECK-NEXT:    str x7, [sp, #88]
 ; CHECK-NEXT:    str x8, [sp, #8]
@@ -19,11 +19,22 @@ define win64cc void @pass_va(i32 %count, ...) nounwind {
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
 ;
+; DARWIN-LABEL: pass_va:
 ; DARWIN:       ; %bb.0: ; %entry
-; DARWIN-DAG:     stp x3, x4, [sp, #56]
-; DARWIN-DAG:     stp x1, x2, [sp, #40]
-; DARWIN-DAG:     stp x5, x6, [sp, #72]
-; DARWIN-DAG:     str x7, [sp, #88]
+; DARWIN-NEXT:    str x18, [sp, #-96]! ; 8-byte Folded Spill
+; DARWIN-NEXT:    add x8, sp, #8
+; DARWIN-NEXT:    add x9, sp, #40
+; DARWIN-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; DARWIN-NEXT:    str x9, [x8]
+; DARWIN-NEXT:    ldr x0, [sp, #8]
+; DARWIN-NEXT:    stp x1, x2, [sp, #40]
+; DARWIN-NEXT:    stp x3, x4, [sp, #56]
+; DARWIN-NEXT:    stp x5, x6, [sp, #72]
+; DARWIN-NEXT:    str x7, [sp, #88]
+; DARWIN-NEXT:    bl _other_func
+; DARWIN-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; DARWIN-NEXT:    ldr x18, [sp], #96 ; 8-byte Folded Reload
+; DARWIN-NEXT:    ret
 entry:
   %ap = alloca ptr, align 8
   call void @llvm.va_start(ptr %ap)
@@ -47,15 +58,15 @@ define win64cc ptr @f9(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64
 ; CHECK-NEXT:    ldr x18, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
-; DARWIN-LABEL: _f9:
-; DARWIN:      ; %bb.0:                                ; %entry
-; DARWIN-NEXT:   str x18, [sp, #-16]!                ; 8-byte Folded Spill
-; DARWIN-NEXT:   add x8, sp, #8
-; DARWIN-NEXT:   add x9, sp, #24
-; DARWIN-NEXT:   str x9, [x8]
-; DARWIN-NEXT:   ldr x0, [sp, #8]
-; DARWIN-NEXT:   ldr x18, [sp], #16                  ; 8-byte Folded Reload
-; DARWIN-NEXT:   ret
+; DARWIN-LABEL: f9:
+; DARWIN:       ; %bb.0: ; %entry
+; DARWIN-NEXT:    str x18, [sp, #-16]! ; 8-byte Folded Spill
+; DARWIN-NEXT:    add x8, sp, #8
+; DARWIN-NEXT:    add x9, sp, #24
+; DARWIN-NEXT:    str x9, [x8]
+; DARWIN-NEXT:    ldr x0, [sp, #8]
+; DARWIN-NEXT:    ldr x18, [sp], #16 ; 8-byte Folded Reload
+; DARWIN-NEXT:    ret
 entry:
   %ap = alloca ptr, align 8
   call void @llvm.va_start(ptr %ap)
@@ -73,15 +84,15 @@ define win64cc ptr @f8(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64
 ; CHECK-NEXT:    ldr x18, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
-; DARWIN-LABEL: _f8:
-; DARWIN:      ; %bb.0:                                ; %entry
-; DARWIN-NEXT:   str x18, [sp, #-16]!                ; 8-byte Folded Spill
-; DARWIN-NEXT:   add x8, sp, #8
-; DARWIN-NEXT:   add x9, sp, #16
-; DARWIN-NEXT:   str x9, [x8]
-; DARWIN-NEXT:   ldr x0, [sp, #8]
-; DARWIN-NEXT:   ldr x18, [sp], #16                  ; 8-byte Folded Reload
-; DARWIN-NEXT:   ret
+; DARWIN-LABEL: f8:
+; DARWIN:       ; %bb.0: ; %entry
+; DARWIN-NEXT:    str x18, [sp, #-16]! ; 8-byte Folded Spill
+; DARWIN-NEXT:    add x8, sp, #8
+; DARWIN-NEXT:    add x9, sp, #16
+; DARWIN-NEXT:    str x9, [x8]
+; DARWIN-NEXT:    ldr x0, [sp, #8]
+; DARWIN-NEXT:    ldr x18, [sp], #16 ; 8-byte Folded Reload
+; DARWIN-NEXT:    ret
 entry:
   %ap = alloca ptr, align 8
   call void @llvm.va_start(ptr %ap)
@@ -100,16 +111,16 @@ define win64cc ptr @f7(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64
 ; CHECK-NEXT:    ldr x18, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
-; DARWIN-LABEL: _f7:
-; DARWIN:      ; %bb.0:                                ; %entry
-; DARWIN-NEXT:   str x18, [sp, #-32]!                ; 8-byte Folded Spill
-; DARWIN-NEXT:   add x8, sp, #8
-; DARWIN-NEXT:   add x9, sp, #24
-; DARWIN-NEXT:   str x7, [sp, #24]
-; DARWIN-NEXT:   str x9, [x8]
-; DARWIN-NEXT:   ldr x0, [sp, #8]
-; DARWIN-NEXT:   ldr x18, [sp], #32                  ; 8-byte Folded Reload
-; DARWIN-NEXT:   ret
+; DARWIN-LABEL: f7:
+; DARWIN:       ; %bb.0: ; %entry
+; DARWIN-NEXT:    str x18, [sp, #-32]! ; 8-byte Folded Spill
+; DARWIN-NEXT:    add x8, sp, #8
+; DARWIN-NEXT:    add x9, sp, #24
+; DARWIN-NEXT:    str x7, [sp, #24]
+; DARWIN-NEXT:    str x9, [x8]
+; DARWIN-NEXT:    ldr x0, [sp, #8]
+; DARWIN-NEXT:    ldr x18, [sp], #32 ; 8-byte Folded Reload
+; DARWIN-NEXT:    ret
 entry:
   %ap = alloca ptr, align 8
   call void @llvm.va_start(ptr %ap)

diff --git a/llvm/test/CodeGen/AArch64/abd-combine.ll b/llvm/test/CodeGen/AArch64/abd-combine.ll
index bb0e9ae503ed48..cc29a59288c7dd 100644
--- a/llvm/test/CodeGen/AArch64/abd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/abd-combine.ll
@@ -20,9 +20,9 @@ define <8 x i16> @abdu_const(<8 x i16> %src1) {
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
 ; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sub v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    abs v1.4s, v2.4s
+; CHECK-NEXT:    sub v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    abs v1.4s, v1.4s
 ; CHECK-NEXT:    abs v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
@@ -37,11 +37,11 @@ define <8 x i16> @abdu_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: abdu_const_lhs:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    usubw2 v2.4s, v1.4s, v0.8h
-; CHECK-NEXT:    usubw v0.4s, v1.4s, v0.4h
-; CHECK-NEXT:    abs v1.4s, v2.4s
+; CHECK-NEXT:    usubw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    usubw2 v0.4s, v1.4s, v0.8h
 ; CHECK-NEXT:    abs v0.4s, v0.4s
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    abs v1.4s, v2.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
   %sub = sub <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
@@ -330,9 +330,9 @@ define <8 x i16> @abds_const(<8 x i16> %src1) {
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
 ; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sub v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    abs v1.4s, v2.4s
+; CHECK-NEXT:    sub v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    abs v1.4s, v1.4s
 ; CHECK-NEXT:    abs v0.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
@@ -347,11 +347,11 @@ define <8 x i16> @abds_const_lhs(<8 x i16> %src1) {
 ; CHECK-LABEL: abds_const_lhs:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ssubw2 v2.4s, v1.4s, v0.8h
-; CHECK-NEXT:    ssubw v0.4s, v1.4s, v0.4h
-; CHECK-NEXT:    abs v1.4s, v2.4s
+; CHECK-NEXT:    ssubw v2.4s, v1.4s, v0.4h
+; CHECK-NEXT:    ssubw2 v0.4s, v1.4s, v0.8h
 ; CHECK-NEXT:    abs v0.4s, v0.4s
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    abs v1.4s, v2.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %sub = sub <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %zextsrc1
@@ -405,11 +405,11 @@ define <8 x i16> @abds_const_bothhigh() {
 define <8 x i16> @abds_undef(<8 x i16> %src1) {
 ; CHECK-LABEL: abds_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    abs v1.4s, v1.4s
+; CHECK-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-NEXT:    sshll2 v0.4s, v0.8h, #0
 ; CHECK-NEXT:    abs v0.4s, v0.4s
-; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    abs v1.4s, v1.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <8 x i16> %src1 to <8 x i32>
   %zextsrc2 = sext <8 x i16> undef to <8 x i32>
@@ -530,10 +530,10 @@ define <8 x i16> @abds_i_reassoc(<8 x i16> %src1) {
 define <1 x i64> @recursive() {
 ; CHECK-LABEL: recursive:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8b, #1
-; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT:    uabd v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    uabdl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-NEXT:    movi v1.8b, #1
+; CHECK-NEXT:    uabd v2.8b, v1.8b, v0.8b
+; CHECK-NEXT:    uabdl v0.8h, v1.8b, v0.8b
 ; CHECK-NEXT:    dup v1.8b, v2.b[0]
 ; CHECK-NEXT:    saddlp v0.1d, v0.2s
 ; CHECK-NEXT:    orr v0.8b, v1.8b, v0.8b

diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 8de5edc7b017a0..e8437b5cd801f5 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -80,9 +80,9 @@ define <vscale x 16 x i1> @lane_mask_nxv16i1_i8(i8 %index, i8 %TC) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    index z0.b, #0, #1
 ; CHECK-NEXT:    mov z1.b, w0
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    uqadd z0.b, z0.b, z1.b
 ; CHECK-NEXT:    mov z1.b, w1
-; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmphi p0.b, p0/z, z1.b, z0.b
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i8(i8 %index, i8 %TC)
@@ -94,13 +94,13 @@ define <vscale x 8 x i1> @lane_mask_nxv8i1_i8(i8 %index, i8 %TC) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    index z0.h, #0, #1
 ; CHECK-NEXT:    mov z1.h, w0
-; CHECK-NEXT:    and z0.h, z0.h, #0xff
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-NEXT:    and z0.h, z0.h, #0xff
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    mov z1.h, w1
 ; CHECK-NEXT:    umin z0.h, z0.h, #255
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cmphi p0.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i8(i8 %index, i8 %TC)
@@ -110,14 +110,14 @@ define <vscale x 8 x i1> @lane_mask_nxv8i1_i8(i8 %index, i8 %TC) {
 define <vscale x 4 x i1> @lane_mask_nxv4i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_nxv4i1_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    and w9, w1, #0xff
-; CHECK-NEXT:    and z0.s, z0.s, #0xff
+; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    and w8, w1, #0xff
+; CHECK-NEXT:    and z0.s, z0.s, #0xff
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    mov z1.s, w9
+; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    umin z0.s, z0.s, #255
 ; CHECK-NEXT:    cmphi p0.s, p0/z, z1.s, z0.s
 ; CHECK-NEXT:    ret
@@ -128,18 +128,18 @@ define <vscale x 4 x i1> @lane_mask_nxv4i1_i8(i8 %index, i8 %TC) {
 define <vscale x 2 x i1> @lane_mask_nxv2i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_nxv2i1_i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0xff
-; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    and x9, x1, #0xff
-; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
 ; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    mov z2.d, x9
+; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    and x8, x1, #0xff
+; CHECK-NEXT:    and z0.d, z0.d, #0xff
 ; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    umin z0.d, z0.d, #255
-; CHECK-NEXT:    cmphi p0.d, p0/z, z2.d, z0.d
+; CHECK-NEXT:    cmphi p0.d, p0/z, z1.d, z0.d
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i8(i8 %index, i8 %TC)
   ret <vscale x 2 x i1> %active.lane.mask
@@ -153,47 +153,49 @@ define <vscale x 32 x i1> @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    mov z3.s, w0
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z1.s, w0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z25.s, w1
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.s, w1
-; CHECK-NEXT:    incw z1.s
-; CHECK-NEXT:    uqadd z5.s, z0.s, z3.s
-; CHECK-NEXT:    incw z2.s, all, mul #2
-; CHECK-NEXT:    mov z6.d, z1.d
-; CHECK-NEXT:    cmphi p1.s, p0/z, z4.s, z5.s
-; CHECK-NEXT:    uqadd z5.s, z1.s, z3.s
-; CHECK-NEXT:    cmphi p2.s, p0/z, z4.s, z5.s
-; CHECK-NEXT:    uqadd z5.s, z2.s, z3.s
-; CHECK-NEXT:    incw z6.s, all, mul #2
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    uqadd z6.s, z0.s, z1.s
 ; CHECK-NEXT:    incw z0.s, all, mul #4
-; CHECK-NEXT:    cmphi p3.s, p0/z, z4.s, z5.s
-; CHECK-NEXT:    uqadd z5.s, z6.s, z3.s
-; CHECK-NEXT:    incw z1.s, all, mul #4
-; CHECK-NEXT:    cmphi p4.s, p0/z, z4.s, z5.s
-; CHECK-NEXT:    uqadd z0.s, z0.s, z3.s
-; CHECK-NEXT:    uqadd z1.s, z1.s, z3.s
+; CHECK-NEXT:    incw z2.s
+; CHECK-NEXT:    incw z3.s, all, mul #2
+; CHECK-NEXT:    cmphi p2.s, p0/z, z25.s, z6.s
+; CHECK-NEXT:    uqadd z0.s, z0.s, z1.s
+; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    uqadd z5.s, z2.s, z1.s
+; CHECK-NEXT:    uqadd z7.s, z3.s, z1.s
 ; CHECK-NEXT:    incw z2.s, all, mul #4
-; CHECK-NEXT:    incw z6.s, all, mul #4
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
-; CHECK-NEXT:    uzp1 p2.h, p3.h, p4.h
-; CHECK-NEXT:    cmphi p3.s, p0/z, z4.s, z0.s
-; CHECK-NEXT:    cmphi p4.s, p0/z, z4.s, z1.s
-; CHECK-NEXT:    uqadd z0.s, z2.s, z3.s
-; CHECK-NEXT:    uqadd z1.s, z6.s, z3.s
-; CHECK-NEXT:    cmphi p5.s, p0/z, z4.s, z0.s
-; CHECK-NEXT:    cmphi p0.s, p0/z, z4.s, z1.s
+; CHECK-NEXT:    incw z3.s, all, mul #4
+; CHECK-NEXT:    cmphi p5.s, p0/z, z25.s, z0.s
+; CHECK-NEXT:    incw z4.s, all, mul #2
+; CHECK-NEXT:    cmphi p1.s, p0/z, z25.s, z5.s
+; CHECK-NEXT:    cmphi p3.s, p0/z, z25.s, z7.s
+; CHECK-NEXT:    uqadd z2.s, z2.s, z1.s
+; CHECK-NEXT:    uqadd z3.s, z3.s, z1.s
+; CHECK-NEXT:    uqadd z24.s, z4.s, z1.s
+; CHECK-NEXT:    incw z4.s, all, mul #4
+; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
+; CHECK-NEXT:    cmphi p6.s, p0/z, z25.s, z2.s
+; CHECK-NEXT:    cmphi p2.s, p0/z, z25.s, z3.s
+; CHECK-NEXT:    uqadd z1.s, z4.s, z1.s
+; CHECK-NEXT:    cmphi p4.s, p0/z, z25.s, z24.s
 ; CHECK-NEXT:    uzp1 p3.h, p3.h, p4.h
-; CHECK-NEXT:    uzp1 p4.h, p5.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p1.b, p2.b
-; CHECK-NEXT:    uzp1 p1.b, p3.b, p4.b
+; CHECK-NEXT:    cmphi p0.s, p0/z, z25.s, z1.s
+; CHECK-NEXT:    uzp1 p4.h, p5.h, p6.h
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p2.h, p2.h, p0.h
+; CHECK-NEXT:    uzp1 p0.b, p1.b, p3.b
+; CHECK-NEXT:    uzp1 p1.b, p4.b, p2.b
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -207,87 +209,93 @@ define <vscale x 32 x i1> @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p10, [sp, #1, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #2, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    mov z3.d, x0
-; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov z0.d, x0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.d, x1
-; CHECK-NEXT:    incd z1.d
-; CHECK-NEXT:    uqadd z5.d, z0.d, z3.d
-; CHECK-NEXT:    uqadd z6.d, z1.d, z3.d
-; CHECK-NEXT:    cmphi p1.d, p0/z, z4.d, z5.d
-; CHECK-NEXT:    mov z5.d, z1.d
-; CHECK-NEXT:    incd z2.d, all, mul #2
-; CHECK-NEXT:    cmphi p2.d, p0/z, z4.d, z6.d
-; CHECK-NEXT:    uqadd z6.d, z2.d, z3.d
-; CHECK-NEXT:    mov z7.d, z0.d
-; CHECK-NEXT:    incd z5.d, all, mul #2
-; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
-; CHECK-NEXT:    cmphi p2.d, p0/z, z4.d, z6.d
-; CHECK-NEXT:    uqadd z6.d, z5.d, z3.d
-; CHECK-NEXT:    mov z24.d, z1.d
-; CHECK-NEXT:    incd z7.d, all, mul #4
-; CHECK-NEXT:    cmphi p3.d, p0/z, z4.d, z6.d
-; CHECK-NEXT:    uqadd z6.d, z7.d, z3.d
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    incd z24.d, all, mul #4
-; CHECK-NEXT:    mov z26.d, z5.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z4.d, z6.d
-; CHECK-NEXT:    uqadd z6.d, z24.d, z3.d
-; CHECK-NEXT:    incd z25.d, all, mul #4
-; CHECK-NEXT:    cmphi p5.d, p0/z, z4.d, z6.d
-; CHECK-NEXT:    uqadd z6.d, z25.d, z3.d
-; CHECK-NEXT:    incd z26.d, all, mul #4
-; CHECK-NEXT:    cmphi p6.d, p0/z, z4.d, z6.d
-; CHECK-NEXT:    uqadd z6.d, z26.d, z3.d
-; CHECK-NEXT:    uzp1 p2.s, p2.s, p3.s
-; CHECK-NEXT:    cmphi p3.d, p0/z, z4.d, z6.d
-; CHECK-NEXT:    incd z0.d, all, mul #8
+; CHECK-NEXT:    mov z7.d, x1
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    uqadd z5.d, z1.d, z0.d
 ; CHECK-NEXT:    incd z1.d, all, mul #8
-; CHECK-NEXT:    uzp1 p4.s, p4.s, p5.s
-; CHECK-NEXT:    uzp1 p3.s, p6.s, p3.s
-; CHECK-NEXT:    uqadd z0.d, z0.d, z3.d
-; CHECK-NEXT:    uqadd z1.d, z1.d, z3.d
+; CHECK-NEXT:    incd z2.d
+; CHECK-NEXT:    incd z3.d, all, mul #2
+; CHECK-NEXT:    incd z6.d, all, mul #4
+; CHECK-NEXT:    cmphi p1.d, p0/z, z7.d, z5.d
+; CHECK-NEXT:    uqadd z1.d, z1.d, z0.d
+; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    uqadd z24.d, z2.d, z0.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z27.d, z3.d
+; CHECK-NEXT:    uqadd z26.d, z3.d, z0.d
+; CHECK-NEXT:    uqadd z28.d, z6.d, z0.d
 ; CHECK-NEXT:    incd z2.d, all, mul #8
-; CHECK-NEXT:    incd z5.d, all, mul #8
-; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
-; CHECK-NEXT:    uzp1 p2.h, p4.h, p3.h
-; CHECK-NEXT:    cmphi p3.d, p0/z, z4.d, z0.d
-; CHECK-NEXT:    cmphi p4.d, p0/z, z4.d, z1.d
-; CHECK-NEXT:    uqadd z0.d, z2.d, z3.d
-; CHECK-NEXT:    uqadd z1.d, z5.d, z3.d
-; CHECK-NEXT:    incd z7.d, all, mul #8
-; CHECK-NEXT:    incd z24.d, all, mul #8
-; CHECK-NEXT:    cmphi p5.d, p0/z, z4.d, z0.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z4.d, z1.d
-; CHECK-NEXT:    uqadd z0.d, z7.d, z3.d
-; CHECK-NEXT:    uqadd z1.d, z24.d, z3.d
+; CHECK-NEXT:    incd z3.d, all, mul #8
+; CHECK-NEXT:    incd z6.d, all, mul #8
+; CHECK-NEXT:    incd z4.d, all, mul #2
+; CHECK-NEXT:    incd z25.d, all, mul #4
+; CHECK-NEXT:    cmphi p2.d, p0/z, z7.d, z24.d
+; CHECK-NEXT:    incd z27.d, all, mul #4
+; CHECK-NEXT:    cmphi p3.d, p0/z, z7.d, z26.d
+; CHECK-NEXT:    cmphi p5.d, p0/z, z7.d, z28.d
+; CHECK-NEXT:    uqadd z2.d, z2.d, z0.d
+; CHECK-NEXT:    uqadd z3.d, z3.d, z0.d
+; CHECK-NEXT:    mov z24.d, z4.d
+; CHECK-NEXT:    uqadd z5.d, z4.d, z0.d
+; CHECK-NEXT:    uqadd z26.d, z25.d, z0.d
+; CHECK-NEXT:    incd z4.d, all, mul #8
 ; CHECK-NEXT:    incd z25.d, all, mul #8
-; CHECK-NEXT:    incd z26.d, all, mul #8
+; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
+; CHECK-NEXT:    incd z24.d, all, mul #4
+; CHECK-NEXT:    cmphi p8.d, p0/z, z7.d, z2.d
+; CHECK-NEXT:    cmphi p4.d, p0/z, z7.d, z5.d
+; CHECK-NEXT:    uqadd z5.d, z27.d, z0.d
+; CHECK-NEXT:    incd z27.d, all, mul #8
+; CHECK-NEXT:    uqadd z4.d, z4.d, z0.d
+; CHECK-NEXT:    cmphi p6.d, p0/z, z7.d, z26.d
+; CHECK-NEXT:    uqadd z28.d, z24.d, z0.d
+; CHECK-NEXT:    incd z24.d, all, mul #8
 ; CHECK-NEXT:    uzp1 p3.s, p3.s, p4.s
-; CHECK-NEXT:    uzp1 p4.s, p5.s, p6.s
-; CHECK-NEXT:    cmphi p5.d, p0/z, z4.d, z0.d
-; CHECK-NEXT:    cmphi p6.d, p0/z, z4.d, z1.d
-; CHECK-NEXT:    uqadd z0.d, z25.d, z3.d
-; CHECK-NEXT:    uqadd z1.d, z26.d, z3.d
-; CHECK-NEXT:    cmphi p7.d, p0/z, z4.d, z0.d
-; CHECK-NEXT:    cmphi p0.d, p0/z, z4.d, z1.d
+; CHECK-NEXT:    cmphi p7.d, p0/z, z7.d, z5.d
+; CHECK-NEXT:    uqadd z5.d, z6.d, z0.d
+; CHECK-NEXT:    uqadd z6.d, z25.d, z0.d
+; CHECK-NEXT:    uqadd z25.d, z27.d, z0.d
+; CHECK-NEXT:    cmphi p4.d, p0/z, z7.d, z1.d
 ; CHECK-NEXT:    uzp1 p5.s, p5.s, p6.s
-; CHECK-NEXT:    uzp1 p0.s, p7.s, p0.s
-; CHECK-NEXT:    uzp1 p3.h, p3.h, p4.h
-; CHECK-NEXT:    uzp1 p4.h, p5.h, p0.h
-; CHECK-NEXT:    uzp1 p0.b, p1.b, p2.b
-; CHECK-NEXT:    uzp1 p1.b, p3.b, p4.b
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    cmphi p6.d, p0/z, z7.d, z3.d
+; CHECK-NEXT:    cmphi p9.d, p0/z, z7.d, z4.d
+; CHECK-NEXT:    uqadd z0.d, z24.d, z0.d
+; CHECK-NEXT:    cmphi p2.d, p0/z, z7.d, z28.d
+; CHECK-NEXT:    cmphi p10.d, p0/z, z7.d, z6.d
+; CHECK-NEXT:    uzp1 p4.s, p4.s, p8.s
+; CHECK-NEXT:    cmphi p8.d, p0/z, z7.d, z25.d
+; CHECK-NEXT:    uzp1 p6.s, p6.s, p9.s
+; CHECK-NEXT:    ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p2.s, p7.s, p2.s
+; CHECK-NEXT:    cmphi p7.d, p0/z, z7.d, z5.d
+; CHECK-NEXT:    cmphi p0.d, p0/z, z7.d, z0.d
+; CHECK-NEXT:    uzp1 p1.h, p1.h, p3.h
+; CHECK-NEXT:    uzp1 p7.s, p7.s, p10.s
+; CHECK-NEXT:    ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p0.s, p8.s, p0.s
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p3.h, p4.h, p6.h
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p2.h, p5.h, p2.h
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p4.h, p7.h, p0.h
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p0.b, p1.b, p2.b
+; CHECK-NEXT:    uzp1 p1.b, p3.b, p4.b
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -299,17 +307,17 @@ define <vscale x 32 x i1> @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) {
 define <vscale x 32 x i1> @lane_mask_nxv32i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_nxv32i1_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    index z0.b, #0, #1
-; CHECK-NEXT:    mov z1.b, w8
+; CHECK-NEXT:    rdvl x8, #1
 ; CHECK-NEXT:    mov z2.b, w0
+; CHECK-NEXT:    mov z1.b, w8
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    add z1.b, z0.b, z1.b
-; CHECK-NEXT:    mov z3.b, w1
 ; CHECK-NEXT:    uqadd z0.b, z0.b, z2.b
-; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    uqadd z1.b, z1.b, z2.b
-; CHECK-NEXT:    cmphi p0.b, p1/z, z3.b, z0.b
-; CHECK-NEXT:    cmphi p1.b, p1/z, z3.b, z1.b
+; CHECK-NEXT:    mov z2.b, w1
+; CHECK-NEXT:    cmphi p0.b, p1/z, z2.b, z0.b
+; CHECK-NEXT:    cmphi p1.b, p1/z, z2.b, z1.b
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i8(i8 %index, i8 %TC)
   ret <vscale x 32 x i1> %active.lane.mask
@@ -420,9 +428,9 @@ define <16 x i1> @lane_mask_v16i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_v16i1_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI24_0
-; CHECK-NEXT:    dup v1.16b, w0
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI24_0]
-; CHECK-NEXT:    uqadd v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    dup v0.16b, w0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI24_0]
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    dup v1.16b, w1
 ; CHECK-NEXT:    cmhi v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    ret
@@ -433,12 +441,12 @@ define <16 x i1> @lane_mask_v16i1_i8(i8 %index, i8 %TC) {
 define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_v8i1_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI25_0
 ; CHECK-NEXT:    dup v0.8b, w0
-; CHECK-NEXT:    dup v2.8b, w1
+; CHECK-NEXT:    adrp x8, .LCPI25_0
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI25_0]
 ; CHECK-NEXT:    uqadd v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    cmhi v0.8b, v2.8b, v0.8b
+; CHECK-NEXT:    dup v1.8b, w1
+; CHECK-NEXT:    cmhi v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i8(i8 %index, i8 %TC)
   ret <8 x i1> %active.lane.mask
@@ -447,16 +455,16 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) {
 define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_v4i1_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI26_0
 ; CHECK-NEXT:    dup v0.4h, w0
+; CHECK-NEXT:    adrp x8, .LCPI26_0
 ; CHECK-NEXT:    movi d2, #0xff00ff00ff00ff
-; CHECK-NEXT:    dup v3.4h, w1
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI26_0]
 ; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    bic v3.4h, #255, lsl #8
 ; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    dup v1.4h, w1
 ; CHECK-NEXT:    umin v0.4h, v0.4h, v2.4h
-; CHECK-NEXT:    cmhi v0.4h, v3.4h, v0.4h
+; CHECK-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-NEXT:    cmhi v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC)
   ret <4 x i1> %active.lane.mask
@@ -465,11 +473,11 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) {
 define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) {
 ; CHECK-LABEL: lane_mask_v2i1_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI27_0
 ; CHECK-NEXT:    movi d0, #0x0000ff000000ff
 ; CHECK-NEXT:    dup v1.2s, w0
-; CHECK-NEXT:    dup v3.2s, w1
+; CHECK-NEXT:    adrp x8, .LCPI27_0
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI27_0]
+; CHECK-NEXT:    dup v3.2s, w1
 ; CHECK-NEXT:    and v1.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    add v1.2s, v1.2s, v2.2s
 ; CHECK-NEXT:    umin v1.2s, v1.2s, v0.2s

diff --git a/llvm/test/CodeGen/AArch64/add-extract.ll b/llvm/test/CodeGen/AArch64/add-extract.ll
index 58b833529cc640..67c9f74ee02988 100644
--- a/llvm/test/CodeGen/AArch64/add-extract.ll
+++ b/llvm/test/CodeGen/AArch64/add-extract.ll
@@ -44,8 +44,8 @@ define void @add_i64_ext_load_store(<1 x i64> %A, ptr %B) nounwind {
 define i64 @add_v2i64_ext_load(<2 x i64> %A, ptr %B) nounwind {
 ; CHECK-LABEL: add_v2i64_ext_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    add x0, x9, x8
 ; CHECK-NEXT:    ret
   %a = extractelement <2 x i64> %A, i32 0
@@ -70,8 +70,8 @@ define i32 @add_i32_ext_load(<1 x i32> %A, ptr %B) nounwind {
 ; CHECK-LABEL: add_i32_ext_load:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    add w0, w9, w8
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i32> %A, i32 0

diff --git a/llvm/test/CodeGen/AArch64/addcarry-crash.ll b/llvm/test/CodeGen/AArch64/addcarry-crash.ll
index 4d07e048c13e55..be75ab101c858b 100644
--- a/llvm/test/CodeGen/AArch64/addcarry-crash.ll
+++ b/llvm/test/CodeGen/AArch64/addcarry-crash.ll
@@ -5,10 +5,10 @@ target triple = "arm64-apple-ios7.0"
 define i64 @foo(ptr nocapture readonly %ptr, i64 %a, i64 %b, i64 %c) local_unnamed_addr #0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    ldr w8, [x0, #4]
-; CHECK-NEXT:    lsr x9, x1, #32
+; CHECK-NEXT:    lsr x8, x1, #32
+; CHECK-NEXT:    ldr w9, [x0, #4]
 ; CHECK-NEXT:    cmn x3, x2
-; CHECK-NEXT:    umull x8, w8, w9
+; CHECK-NEXT:    umull x8, w9, w8
 ; CHECK-NEXT:    cinc x0, x8, hs
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
index 75fdac707b834e..10b30b0265de8e 100644
--- a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
+++ b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
@@ -213,9 +213,9 @@ define <4 x i32> @vec_add_const_const_sub_extrause(<4 x i32> %arg) {
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    bl vec_use
+; CHECK-NEXT:    mvni v0.4s, #5
 ; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    mvni v0.4s, #5
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
@@ -290,9 +290,9 @@ define <4 x i32> @vec_sub_const_add_const_extrause(<4 x i32> %arg) {
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    bl vec_use
+; CHECK-NEXT:    mvni v0.4s, #5
 ; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    mvni v0.4s, #5
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
index a483d9a8e97d36..1b86fe6c707c8e 100644
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -232,7 +232,7 @@ define i32 @sub_two_parts_imm_i32_neg(i32 %a) {
 define i32 @add_27962026(i32 %a) {
 ; CHECK-LABEL: add_27962026:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #43690
+; CHECK-NEXT:    mov w8, #43690 // =0xaaaa
 ; CHECK-NEXT:    movk w8, #426, lsl #16
 ; CHECK-NEXT:    add w0, w0, w8
 ; CHECK-NEXT:    ret
@@ -243,7 +243,7 @@ define i32 @add_27962026(i32 %a) {
 define i32 @add_65534(i32 %a) {
 ; CHECK-LABEL: add_65534:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #65534
+; CHECK-NEXT:    mov w8, #65534 // =0xfffe
 ; CHECK-NEXT:    add w0, w0, w8
 ; CHECK-NEXT:    ret
   %b = add i32 %a, 65534
@@ -259,7 +259,7 @@ define void @add_in_loop(i32 %0) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w19, #43690
+; CHECK-NEXT:    mov w19, #43690 // =0xaaaa
 ; CHECK-NEXT:    movk w19, #170, lsl #16
 ; CHECK-NEXT:  .LBB15_1: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add w0, w0, w19
@@ -373,7 +373,7 @@ declare {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
 define i1 @uadd_add(i8 %a, i8 %b, ptr %p) {
 ; CHECK-LABEL: uadd_add:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #255
+; CHECK-NEXT:    mov w8, #255 // =0xff
 ; CHECK-NEXT:    bic w8, w8, w0
 ; CHECK-NEXT:    add w8, w8, w1, uxtb
 ; CHECK-NEXT:    lsr w0, w8, #8
@@ -398,7 +398,7 @@ define i1 @uadd_add(i8 %a, i8 %b, ptr %p) {
 define i64 @addl_0x80000000(i64 %a) {
 ; CHECK-LABEL: addl_0x80000000:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #48576
+; CHECK-NEXT:    mov w8, #48576 // =0xbdc0
 ; CHECK-NEXT:    movk w8, #65520, lsl #16
 ; CHECK-NEXT:    add x0, x0, x8
 ; CHECK-NEXT:    ret
@@ -499,7 +499,7 @@ define i1 @ne_ln(i64 %0) {
 define i1 @reject_eq(i32 %0) {
 ; CHECK-LABEL: reject_eq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #51712
+; CHECK-NEXT:    mov w8, #51712 // =0xca00
 ; CHECK-NEXT:    movk w8, #15258, lsl #16
 ; CHECK-NEXT:    cmp w0, w8
 ; CHECK-NEXT:    cset w0, eq
@@ -511,7 +511,7 @@ define i1 @reject_eq(i32 %0) {
 define i1 @reject_non_eqne_csinc(i32 %0) {
 ; CHECK-LABEL: reject_non_eqne_csinc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #4369
+; CHECK-NEXT:    mov w8, #4369 // =0x1111
 ; CHECK-NEXT:    movk w8, #17, lsl #16
 ; CHECK-NEXT:    cmp w0, w8
 ; CHECK-NEXT:    cset w0, lo
@@ -524,9 +524,9 @@ define i32 @accept_csel(i32 %0) {
 ; CHECK-LABEL: accept_csel:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub w9, w0, #273, lsl #12 // =1118208
-; CHECK-NEXT:    mov w8, #17
+; CHECK-NEXT:    mov w8, #17 // =0x11
 ; CHECK-NEXT:    cmp w9, #273
-; CHECK-NEXT:    mov w9, #11
+; CHECK-NEXT:    mov w9, #11 // =0xb
 ; CHECK-NEXT:    csel w0, w9, w8, eq
 ; CHECK-NEXT:    ret
   %2 = icmp eq i32 %0, 1118481
@@ -537,11 +537,11 @@ define i32 @accept_csel(i32 %0) {
 define i32 @reject_non_eqne_csel(i32 %0) {
 ; CHECK-LABEL: reject_non_eqne_csel:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #4369
-; CHECK-NEXT:    mov w9, #11
+; CHECK-NEXT:    mov w8, #4369 // =0x1111
+; CHECK-NEXT:    mov w9, #11 // =0xb
 ; CHECK-NEXT:    movk w8, #17, lsl #16
 ; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    mov w8, #17
+; CHECK-NEXT:    mov w8, #17 // =0x11
 ; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
   %2 = icmp ult i32 %0, 1118481
@@ -573,7 +573,7 @@ define void @accept_branch(i32 %0) {
 define void @reject_non_eqne_branch(i32 %0) {
 ; CHECK-LABEL: reject_non_eqne_branch:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #13398
+; CHECK-NEXT:    mov w8, #13398 // =0x3456
 ; CHECK-NEXT:    movk w8, #18, lsl #16
 ; CHECK-NEXT:    cmp w0, w8
 ; CHECK-NEXT:    b.le .LBB33_2
@@ -593,20 +593,20 @@ define void @reject_non_eqne_branch(i32 %0) {
 define i32 @reject_multiple_usages(i32 %0) {
 ; CHECK-LABEL: reject_multiple_usages:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #4369
-; CHECK-NEXT:    mov w9, #3
+; CHECK-NEXT:    mov w8, #4369 // =0x1111
+; CHECK-NEXT:    mov w9, #3 // =0x3
+; CHECK-NEXT:    mov w10, #17 // =0x11
 ; CHECK-NEXT:    movk w8, #17, lsl #16
-; CHECK-NEXT:    mov w10, #17
+; CHECK-NEXT:    mov w11, #12 // =0xc
 ; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    mov w8, #9
-; CHECK-NEXT:    mov w11, #12
+; CHECK-NEXT:    mov w8, #9 // =0x9
 ; CHECK-NEXT:    csel w8, w8, w9, eq
 ; CHECK-NEXT:    csel w9, w11, w10, hi
+; CHECK-NEXT:    mov w10, #53312 // =0xd040
+; CHECK-NEXT:    movk w10, #2, lsl #16
 ; CHECK-NEXT:    add w8, w8, w9
-; CHECK-NEXT:    mov w9, #53312
-; CHECK-NEXT:    movk w9, #2, lsl #16
-; CHECK-NEXT:    cmp w0, w9
-; CHECK-NEXT:    mov w9, #26304
+; CHECK-NEXT:    mov w9, #26304 // =0x66c0
+; CHECK-NEXT:    cmp w0, w10
 ; CHECK-NEXT:    movk w9, #1433, lsl #16
 ; CHECK-NEXT:    csel w0, w8, w9, hi
 ; CHECK-NEXT:    ret
@@ -666,11 +666,11 @@ define dso_local i32 @_extract_crng_crng() {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    adrp x8, primary_crng
-; CHECK-NEXT:    adrp x9, input_pool
-; CHECK-NEXT:    add x9, x9, :lo12:input_pool
 ; CHECK-NEXT:    ldr w8, [x8, :lo12:primary_crng]
 ; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    csel x0, xzr, x9, eq
+; CHECK-NEXT:    adrp x8, input_pool
+; CHECK-NEXT:    add x8, x8, :lo12:input_pool
+; CHECK-NEXT:    csel x0, xzr, x8, eq
 ; CHECK-NEXT:    bl crng_reseed
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:  .LBB36_3: // %if.end
@@ -778,7 +778,7 @@ define i32 @commute_subop0_zext(i16 %x, i32 %y, i32 %z) {
 define i8 @commute_subop0_anyext(i16 %a, i16 %b, i32 %c) {
 ; CHECK-LABEL: commute_subop0_anyext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #111
+; CHECK-NEXT:    mov w8, #111 // =0x6f
 ; CHECK-NEXT:    sub w9, w2, w1
 ; CHECK-NEXT:    madd w8, w0, w8, w9
 ; CHECK-NEXT:    lsl w8, w8, #3

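A note on the recurring constant churn in addsub.ll and the files below: the regenerated check lines keep the AArch64 asm printer's verbose immediate comments, so mov w8, #65534 becomes mov w8, #65534 // =0xfffe with no change to the generated code itself. A minimal sketch of the input behind one such line, lifted from the test above (the usual flow is llc -mtriple=aarch64 followed by llvm/utils/update_llc_test_checks.py; exact RUN lines omitted):

define i32 @add_65534(i32 %a) {
  ; 65534 (0xfffe) does not fit the 12-bit add immediate, so it is first
  ; materialized with a mov; verbose asm prints it back as "// =0xfffe".
  %b = add i32 %a, 65534
  ret i32 %b
}
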
diff --git a/llvm/test/CodeGen/AArch64/align-down.ll b/llvm/test/CodeGen/AArch64/align-down.ll
index cb3b1f0cbb6484..cda0d1304b1c5d 100644
--- a/llvm/test/CodeGen/AArch64/align-down.ll
+++ b/llvm/test/CodeGen/AArch64/align-down.ll
@@ -84,8 +84,8 @@ define i32 @n5_extrause2(i32 %ptr, i32 %alignment, i32* %mask_storage, i32* %bia
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub w8, w1, #1
 ; CHECK-NEXT:    and w9, w0, w8
-; CHECK-NEXT:    sub w0, w0, w9
 ; CHECK-NEXT:    str w8, [x2]
+; CHECK-NEXT:    sub w0, w0, w9
 ; CHECK-NEXT:    str w9, [x3]
 ; CHECK-NEXT:    ret
   %mask = add i32 %alignment, -1

diff --git a/llvm/test/CodeGen/AArch64/and-mask-removal.ll b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
index b86c600e41acfc..fa618ef6ac37bb 100644
--- a/llvm/test/CodeGen/AArch64/and-mask-removal.ll
+++ b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
@@ -22,8 +22,8 @@ define void @new_position(i32 %pos) {
 ; CHECK-SD-NEXT:    adrp x9, _next_string@GOTPAGE
 ; CHECK-SD-NEXT:    adrp x10, _string_number@GOTPAGE
 ; CHECK-SD-NEXT:    ldr x9, [x9, _next_string@GOTPAGEOFF]
-; CHECK-SD-NEXT:    ldr w9, [x9]
 ; CHECK-SD-NEXT:    ldr x10, [x10, _string_number@GOTPAGEOFF]
+; CHECK-SD-NEXT:    ldr w9, [x9]
 ; CHECK-SD-NEXT:    str w9, [x10, x8, lsl #2]
 ; CHECK-SD-NEXT:  LBB0_2: ; %if.end
 ; CHECK-SD-NEXT:    ret
@@ -40,8 +40,8 @@ define void @new_position(i32 %pos) {
 ; CHECK-GI-NEXT:    adrp x8, _next_string@GOTPAGE
 ; CHECK-GI-NEXT:    adrp x9, _string_number@GOTPAGE
 ; CHECK-GI-NEXT:    ldr x8, [x8, _next_string@GOTPAGEOFF]
-; CHECK-GI-NEXT:    ldr w8, [x8]
 ; CHECK-GI-NEXT:    ldr x9, [x9, _string_number@GOTPAGEOFF]
+; CHECK-GI-NEXT:    ldr w8, [x8]
 ; CHECK-GI-NEXT:    str w8, [x9, w0, sxtw #2]
 ; CHECK-GI-NEXT:  LBB0_2: ; %if.end
 ; CHECK-GI-NEXT:    ret
@@ -270,15 +270,15 @@ ret_true:
 define zeroext i1 @test16_0(i16 zeroext %x)  align 2 {
 ; CHECK-SD-LABEL: test16_0:
 ; CHECK-SD:       ; %bb.0: ; %entry
-; CHECK-SD-NEXT:    mov w8, #5086
+; CHECK-SD-NEXT:    mov w8, #5086 ; =0x13de
 ; CHECK-SD-NEXT:    cmp w0, w8
 ; CHECK-SD-NEXT:    cset w0, ne
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: test16_0:
 ; CHECK-GI:       ; %bb.0: ; %entry
-; CHECK-GI-NEXT:    mov w8, #18547
-; CHECK-GI-NEXT:    mov w9, #23633
+; CHECK-GI-NEXT:    mov w8, #18547 ; =0x4873
+; CHECK-GI-NEXT:    mov w9, #23633 ; =0x5c51
 ; CHECK-GI-NEXT:    add w8, w0, w8
 ; CHECK-GI-NEXT:    cmp w9, w8, uxth
 ; CHECK-GI-NEXT:    cset w0, ne
@@ -296,8 +296,8 @@ ret_true:
 define zeroext i1 @test16_2(i16 zeroext %x)  align 2 {
 ; CHECK-SD-LABEL: test16_2:
 ; CHECK-SD:       ; %bb.0: ; %entry
-; CHECK-SD-NEXT:    mov w8, #16882
-; CHECK-SD-NEXT:    mov w9, #40700
+; CHECK-SD-NEXT:    mov w8, #16882 ; =0x41f2
+; CHECK-SD-NEXT:    mov w9, #40700 ; =0x9efc
 ; CHECK-SD-NEXT:    add w8, w0, w8
 ; CHECK-SD-NEXT:    cmp w9, w8, uxth
 ; CHECK-SD-NEXT:    cset w0, hi
@@ -305,8 +305,8 @@ define zeroext i1 @test16_2(i16 zeroext %x)  align 2 {
 ;
 ; CHECK-GI-LABEL: test16_2:
 ; CHECK-GI:       ; %bb.0: ; %entry
-; CHECK-GI-NEXT:    mov w8, #16882
-; CHECK-GI-NEXT:    mov w9, #40699
+; CHECK-GI-NEXT:    mov w8, #16882 ; =0x41f2
+; CHECK-GI-NEXT:    mov w9, #40699 ; =0x9efb
 ; CHECK-GI-NEXT:    add w8, w0, w8
 ; CHECK-GI-NEXT:    cmp w9, w8, uxth
 ; CHECK-GI-NEXT:    cset w0, hs
@@ -324,15 +324,15 @@ ret_true:
 define zeroext i1 @test16_3(i16 zeroext %x)  align 2 {
 ; CHECK-SD-LABEL: test16_3:
 ; CHECK-SD:       ; %bb.0: ; %entry
-; CHECK-SD-NEXT:    mov w8, #53200
+; CHECK-SD-NEXT:    mov w8, #53200 ; =0xcfd0
 ; CHECK-SD-NEXT:    cmp w0, w8
 ; CHECK-SD-NEXT:    cset w0, ne
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: test16_3:
 ; CHECK-GI:       ; %bb.0: ; %entry
-; CHECK-GI-NEXT:    mov w8, #29283
-; CHECK-GI-NEXT:    mov w9, #16947
+; CHECK-GI-NEXT:    mov w8, #29283 ; =0x7263
+; CHECK-GI-NEXT:    mov w9, #16947 ; =0x4233
 ; CHECK-GI-NEXT:    add w8, w0, w8
 ; CHECK-GI-NEXT:    cmp w9, w8, uxth
 ; CHECK-GI-NEXT:    cset w0, ne
@@ -350,8 +350,8 @@ ret_true:
 define zeroext i1 @test16_4(i16 zeroext %x)  align 2 {
 ; CHECK-SD-LABEL: test16_4:
 ; CHECK-SD:       ; %bb.0: ; %entry
-; CHECK-SD-NEXT:    mov w8, #29985
-; CHECK-SD-NEXT:    mov w9, #15676
+; CHECK-SD-NEXT:    mov w8, #29985 ; =0x7521
+; CHECK-SD-NEXT:    mov w9, #15676 ; =0x3d3c
 ; CHECK-SD-NEXT:    add w8, w0, w8
 ; CHECK-SD-NEXT:    cmp w9, w8, uxth
 ; CHECK-SD-NEXT:    cset w0, lo
@@ -359,8 +359,8 @@ define zeroext i1 @test16_4(i16 zeroext %x)  align 2 {
 ;
 ; CHECK-GI-LABEL: test16_4:
 ; CHECK-GI:       ; %bb.0: ; %entry
-; CHECK-GI-NEXT:    mov w8, #29985
-; CHECK-GI-NEXT:    mov w9, #15677
+; CHECK-GI-NEXT:    mov w8, #29985 ; =0x7521
+; CHECK-GI-NEXT:    mov w9, #15677 ; =0x3d3d
 ; CHECK-GI-NEXT:    add w8, w0, w8
 ; CHECK-GI-NEXT:    cmp w9, w8, uxth
 ; CHECK-GI-NEXT:    cset w0, ls
@@ -378,15 +378,15 @@ ret_true:
 define zeroext i1 @test16_5(i16 zeroext %x)  align 2 {
 ; CHECK-SD-LABEL: test16_5:
 ; CHECK-SD:       ; %bb.0: ; %entry
-; CHECK-SD-NEXT:    mov w8, #23282
+; CHECK-SD-NEXT:    mov w8, #23282 ; =0x5af2
 ; CHECK-SD-NEXT:    cmp w0, w8
 ; CHECK-SD-NEXT:    cset w0, ne
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: test16_5:
 ; CHECK-GI:       ; %bb.0: ; %entry
-; CHECK-GI-NEXT:    mov w8, #-25214
-; CHECK-GI-NEXT:    mov w9, #63604
+; CHECK-GI-NEXT:    mov w8, #-25214 ; =0xffff9d82
+; CHECK-GI-NEXT:    mov w9, #63604 ; =0xf874
 ; CHECK-GI-NEXT:    add w8, w0, w8
 ; CHECK-GI-NEXT:    cmp w9, w8, uxth
 ; CHECK-GI-NEXT:    cset w0, ne
@@ -404,8 +404,8 @@ ret_true:
 define zeroext i1 @test16_6(i16 zeroext %x)  align 2 {
 ; CHECK-SD-LABEL: test16_6:
 ; CHECK-SD:       ; %bb.0: ; %entry
-; CHECK-SD-NEXT:    mov w8, #-32194
-; CHECK-SD-NEXT:    mov w9, #24320
+; CHECK-SD-NEXT:    mov w8, #-32194 ; =0xffff823e
+; CHECK-SD-NEXT:    mov w9, #24320 ; =0x5f00
 ; CHECK-SD-NEXT:    add w8, w0, w8
 ; CHECK-SD-NEXT:    cmp w8, w9
 ; CHECK-SD-NEXT:    cset w0, hi
@@ -413,8 +413,8 @@ define zeroext i1 @test16_6(i16 zeroext %x)  align 2 {
 ;
 ; CHECK-GI-LABEL: test16_6:
 ; CHECK-GI:       ; %bb.0: ; %entry
-; CHECK-GI-NEXT:    mov w8, #-32194
-; CHECK-GI-NEXT:    mov w9, #24321
+; CHECK-GI-NEXT:    mov w8, #-32194 ; =0xffff823e
+; CHECK-GI-NEXT:    mov w9, #24321 ; =0x5f01
 ; CHECK-GI-NEXT:    add w8, w0, w8
 ; CHECK-GI-NEXT:    cmp w8, w9
 ; CHECK-GI-NEXT:    cset w0, hs
@@ -432,8 +432,8 @@ ret_true:
 define zeroext i1 @test16_7(i16 zeroext %x)  align 2 {
 ; CHECK-SD-LABEL: test16_7:
 ; CHECK-SD:       ; %bb.0: ; %entry
-; CHECK-SD-NEXT:    mov w8, #9272
-; CHECK-SD-NEXT:    mov w9, #22619
+; CHECK-SD-NEXT:    mov w8, #9272 ; =0x2438
+; CHECK-SD-NEXT:    mov w9, #22619 ; =0x585b
 ; CHECK-SD-NEXT:    add w8, w0, w8
 ; CHECK-SD-NEXT:    cmp w9, w8, uxth
 ; CHECK-SD-NEXT:    cset w0, lo
@@ -441,8 +441,8 @@ define zeroext i1 @test16_7(i16 zeroext %x)  align 2 {
 ;
 ; CHECK-GI-LABEL: test16_7:
 ; CHECK-GI:       ; %bb.0: ; %entry
-; CHECK-GI-NEXT:    mov w8, #9272
-; CHECK-GI-NEXT:    mov w9, #22620
+; CHECK-GI-NEXT:    mov w8, #9272 ; =0x2438
+; CHECK-GI-NEXT:    mov w9, #22620 ; =0x585c
 ; CHECK-GI-NEXT:    add w8, w0, w8
 ; CHECK-GI-NEXT:    cmp w9, w8, uxth
 ; CHECK-GI-NEXT:    cset w0, ls
@@ -460,16 +460,16 @@ ret_true:
 define zeroext i1 @test16_8(i16 zeroext %x)  align 2 {
 ; CHECK-SD-LABEL: test16_8:
 ; CHECK-SD:       ; %bb.0: ; %entry
-; CHECK-SD-NEXT:    mov w8, #4919
+; CHECK-SD-NEXT:    mov w8, #4919 ; =0x1337
 ; CHECK-SD-NEXT:    cmp w0, w8
 ; CHECK-SD-NEXT:    cset w0, ne
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: test16_8:
 ; CHECK-GI:       ; %bb.0: ; %entry
-; CHECK-GI-NEXT:    add w8, w0, #1787
-; CHECK-GI-NEXT:    mov w9, #6706
-; CHECK-GI-NEXT:    cmp w9, w8, uxth
+; CHECK-GI-NEXT:    mov w8, #6706 ; =0x1a32
+; CHECK-GI-NEXT:    add w9, w0, #1787
+; CHECK-GI-NEXT:    cmp w8, w9, uxth
 ; CHECK-GI-NEXT:    cset w0, ne
 ; CHECK-GI-NEXT:    ret
 entry:

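Several SD/GI pairs above differ only by an off-by-one constant next to a flipped condition code (hi vs. hs, lo vs. ls). The two spellings are equivalent unsigned bounds, since x >u K and x >=u K+1 test the same thing; only scheduling and the new hex comments actually changed. Below is a sketch of the rebased range-check pattern this file exercises; the function name and constants are illustrative, not copied from the test:

define zeroext i1 @range_check_sketch(i16 zeroext %x) {
entry:
  ; Illustrative only. A two-sided check such as 50 <u x <u 100 is rebased
  ; with an add so a single unsigned compare covers both bounds; that bound
  ; may be printed as ">u K" (hi/lo) or ">=u K+1" (hs/ls) interchangeably.
  %gt = icmp ugt i16 %x, 50
  %lt = icmp ult i16 %x, 100
  %in = and i1 %gt, %lt
  ret i1 %in
}
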
diff --git a/llvm/test/CodeGen/AArch64/andorbrcompare.ll b/llvm/test/CodeGen/AArch64/andorbrcompare.ll
index 117d8ba82f9c38..951a5cdb9571ca 100644
--- a/llvm/test/CodeGen/AArch64/andorbrcompare.ll
+++ b/llvm/test/CodeGen/AArch64/andorbrcompare.ll
@@ -9,15 +9,17 @@ define i32 @and_eq_ne_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5,
 ; SDISEL:       // %bb.0: // %entry
 ; SDISEL-NEXT:    cmp w2, w3
 ; SDISEL-NEXT:    ccmp w0, w1, #0, ne
-; SDISEL-NEXT:    ccmp w4, w5, #0, ne
-; SDISEL-NEXT:    b.hs .LBB0_2
-; SDISEL-NEXT:  // %bb.1: // %if
-; SDISEL-NEXT:    mov w0, #1
-; SDISEL-NEXT:    str w0, [x6]
-; SDISEL-NEXT:    ret
-; SDISEL-NEXT:  .LBB0_2:
+; SDISEL-NEXT:    b.eq .LBB0_3
+; SDISEL-NEXT:  // %bb.1: // %entry
+; SDISEL-NEXT:    cmp w4, w5
+; SDISEL-NEXT:    b.lo .LBB0_3
+; SDISEL-NEXT:  // %bb.2:
 ; SDISEL-NEXT:    mov w0, wzr
 ; SDISEL-NEXT:    ret
+; SDISEL-NEXT:  .LBB0_3: // %if
+; SDISEL-NEXT:    mov w0, #1 // =0x1
+; SDISEL-NEXT:    str w0, [x6]
+; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: and_eq_ne_ult:
 ; GISEL:       // %bb.0: // %entry
@@ -28,13 +30,13 @@ define i32 @and_eq_ne_ult(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5,
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    tbnz w8, #0, .LBB0_3
 ; GISEL-NEXT:  // %bb.1: // %entry
-; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    cmp w4, w5
+; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    b.lo .LBB0_3
 ; GISEL-NEXT:  // %bb.2: // %common.ret
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  .LBB0_3: // %if
-; GISEL-NEXT:    mov w0, #1
+; GISEL-NEXT:    mov w0, #1 // =0x1
 ; GISEL-NEXT:    str w0, [x6]
 ; GISEL-NEXT:    ret
 entry:
@@ -58,15 +60,17 @@ define i32 @and_ne_ult_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5,
 ; SDISEL:       // %bb.0: // %entry
 ; SDISEL-NEXT:    cmp w2, w3
 ; SDISEL-NEXT:    ccmp w0, w1, #4, lo
-; SDISEL-NEXT:    ccmp w4, w5, #0, eq
-; SDISEL-NEXT:    b.hi .LBB1_2
-; SDISEL-NEXT:  // %bb.1: // %if
-; SDISEL-NEXT:    mov w0, #1
-; SDISEL-NEXT:    str w0, [x6]
-; SDISEL-NEXT:    ret
-; SDISEL-NEXT:  .LBB1_2:
+; SDISEL-NEXT:    b.ne .LBB1_3
+; SDISEL-NEXT:  // %bb.1: // %entry
+; SDISEL-NEXT:    cmp w4, w5
+; SDISEL-NEXT:    b.ls .LBB1_3
+; SDISEL-NEXT:  // %bb.2:
 ; SDISEL-NEXT:    mov w0, wzr
 ; SDISEL-NEXT:    ret
+; SDISEL-NEXT:  .LBB1_3: // %if
+; SDISEL-NEXT:    mov w0, #1 // =0x1
+; SDISEL-NEXT:    str w0, [x6]
+; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: and_ne_ult_ule:
 ; GISEL:       // %bb.0: // %entry
@@ -77,13 +81,13 @@ define i32 @and_ne_ult_ule(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5,
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    tbnz w8, #0, .LBB1_3
 ; GISEL-NEXT:  // %bb.1: // %entry
-; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    cmp w4, w5
+; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    b.ls .LBB1_3
 ; GISEL-NEXT:  // %bb.2: // %common.ret
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  .LBB1_3: // %if
-; GISEL-NEXT:    mov w0, #1
+; GISEL-NEXT:    mov w0, #1 // =0x1
 ; GISEL-NEXT:    str w0, [x6]
 ; GISEL-NEXT:    ret
 entry:
@@ -107,15 +111,17 @@ define i32 @and_ult_ule_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; SDISEL:       // %bb.0: // %entry
 ; SDISEL-NEXT:    cmp w2, w3
 ; SDISEL-NEXT:    ccmp w0, w1, #2, ls
-; SDISEL-NEXT:    ccmp w4, w5, #2, hs
-; SDISEL-NEXT:    b.ls .LBB2_2
-; SDISEL-NEXT:  // %bb.1: // %if
-; SDISEL-NEXT:    mov w0, #1
-; SDISEL-NEXT:    str w0, [x6]
-; SDISEL-NEXT:    ret
-; SDISEL-NEXT:  .LBB2_2:
+; SDISEL-NEXT:    b.lo .LBB2_3
+; SDISEL-NEXT:  // %bb.1: // %entry
+; SDISEL-NEXT:    cmp w4, w5
+; SDISEL-NEXT:    b.hi .LBB2_3
+; SDISEL-NEXT:  // %bb.2:
 ; SDISEL-NEXT:    mov w0, wzr
 ; SDISEL-NEXT:    ret
+; SDISEL-NEXT:  .LBB2_3: // %if
+; SDISEL-NEXT:    mov w0, #1 // =0x1
+; SDISEL-NEXT:    str w0, [x6]
+; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: and_ult_ule_ugt:
 ; GISEL:       // %bb.0: // %entry
@@ -126,13 +132,13 @@ define i32 @and_ult_ule_ugt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    tbnz w8, #0, .LBB2_3
 ; GISEL-NEXT:  // %bb.1: // %entry
-; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    cmp w4, w5
+; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    b.hi .LBB2_3
 ; GISEL-NEXT:  // %bb.2: // %common.ret
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  .LBB2_3: // %if
-; GISEL-NEXT:    mov w0, #1
+; GISEL-NEXT:    mov w0, #1 // =0x1
 ; GISEL-NEXT:    str w0, [x6]
 ; GISEL-NEXT:    ret
 entry:
@@ -156,15 +162,17 @@ define i32 @and_ule_ugt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; SDISEL:       // %bb.0: // %entry
 ; SDISEL-NEXT:    cmp w2, w3
 ; SDISEL-NEXT:    ccmp w0, w1, #2, hi
-; SDISEL-NEXT:    ccmp w4, w5, #2, hi
-; SDISEL-NEXT:    b.lo .LBB3_2
-; SDISEL-NEXT:  // %bb.1: // %if
-; SDISEL-NEXT:    mov w0, #1
-; SDISEL-NEXT:    str w0, [x6]
-; SDISEL-NEXT:    ret
-; SDISEL-NEXT:  .LBB3_2:
+; SDISEL-NEXT:    b.ls .LBB3_3
+; SDISEL-NEXT:  // %bb.1: // %entry
+; SDISEL-NEXT:    cmp w4, w5
+; SDISEL-NEXT:    b.hs .LBB3_3
+; SDISEL-NEXT:  // %bb.2:
 ; SDISEL-NEXT:    mov w0, wzr
 ; SDISEL-NEXT:    ret
+; SDISEL-NEXT:  .LBB3_3: // %if
+; SDISEL-NEXT:    mov w0, #1 // =0x1
+; SDISEL-NEXT:    str w0, [x6]
+; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: and_ule_ugt_uge:
 ; GISEL:       // %bb.0: // %entry
@@ -175,13 +183,13 @@ define i32 @and_ule_ugt_uge(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    tbnz w8, #0, .LBB3_3
 ; GISEL-NEXT:  // %bb.1: // %entry
-; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    cmp w4, w5
+; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    b.hs .LBB3_3
 ; GISEL-NEXT:  // %bb.2: // %common.ret
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  .LBB3_3: // %if
-; GISEL-NEXT:    mov w0, #1
+; GISEL-NEXT:    mov w0, #1 // =0x1
 ; GISEL-NEXT:    str w0, [x6]
 ; GISEL-NEXT:    ret
 entry:
@@ -205,15 +213,17 @@ define i32 @and_ugt_uge_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; SDISEL:       // %bb.0: // %entry
 ; SDISEL-NEXT:    cmp w2, w3
 ; SDISEL-NEXT:    ccmp w0, w1, #0, hs
-; SDISEL-NEXT:    ccmp w4, w5, #8, ls
-; SDISEL-NEXT:    b.ge .LBB4_2
-; SDISEL-NEXT:  // %bb.1: // %if
-; SDISEL-NEXT:    mov w0, #1
-; SDISEL-NEXT:    str w0, [x6]
-; SDISEL-NEXT:    ret
-; SDISEL-NEXT:  .LBB4_2:
+; SDISEL-NEXT:    b.hi .LBB4_3
+; SDISEL-NEXT:  // %bb.1: // %entry
+; SDISEL-NEXT:    cmp w4, w5
+; SDISEL-NEXT:    b.lt .LBB4_3
+; SDISEL-NEXT:  // %bb.2:
 ; SDISEL-NEXT:    mov w0, wzr
 ; SDISEL-NEXT:    ret
+; SDISEL-NEXT:  .LBB4_3: // %if
+; SDISEL-NEXT:    mov w0, #1 // =0x1
+; SDISEL-NEXT:    str w0, [x6]
+; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: and_ugt_uge_slt:
 ; GISEL:       // %bb.0: // %entry
@@ -224,13 +234,13 @@ define i32 @and_ugt_uge_slt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    tbnz w8, #0, .LBB4_3
 ; GISEL-NEXT:  // %bb.1: // %entry
-; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    cmp w4, w5
+; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    b.lt .LBB4_3
 ; GISEL-NEXT:  // %bb.2: // %common.ret
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  .LBB4_3: // %if
-; GISEL-NEXT:    mov w0, #1
+; GISEL-NEXT:    mov w0, #1 // =0x1
 ; GISEL-NEXT:    str w0, [x6]
 ; GISEL-NEXT:    ret
 entry:
@@ -254,15 +264,17 @@ define i32 @and_uge_slt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; SDISEL:       // %bb.0: // %entry
 ; SDISEL-NEXT:    cmp w2, w3
 ; SDISEL-NEXT:    ccmp w0, w1, #0, lt
-; SDISEL-NEXT:    ccmp w4, w5, #4, lo
-; SDISEL-NEXT:    b.gt .LBB5_2
-; SDISEL-NEXT:  // %bb.1: // %if
-; SDISEL-NEXT:    mov w0, #1
-; SDISEL-NEXT:    str w0, [x6]
-; SDISEL-NEXT:    ret
-; SDISEL-NEXT:  .LBB5_2:
+; SDISEL-NEXT:    b.hs .LBB5_3
+; SDISEL-NEXT:  // %bb.1: // %entry
+; SDISEL-NEXT:    cmp w4, w5
+; SDISEL-NEXT:    b.le .LBB5_3
+; SDISEL-NEXT:  // %bb.2:
 ; SDISEL-NEXT:    mov w0, wzr
 ; SDISEL-NEXT:    ret
+; SDISEL-NEXT:  .LBB5_3: // %if
+; SDISEL-NEXT:    mov w0, #1 // =0x1
+; SDISEL-NEXT:    str w0, [x6]
+; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: and_uge_slt_sle:
 ; GISEL:       // %bb.0: // %entry
@@ -273,13 +285,13 @@ define i32 @and_uge_slt_sle(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    tbnz w8, #0, .LBB5_3
 ; GISEL-NEXT:  // %bb.1: // %entry
-; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    cmp w4, w5
+; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    b.le .LBB5_3
 ; GISEL-NEXT:  // %bb.2: // %common.ret
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  .LBB5_3: // %if
-; GISEL-NEXT:    mov w0, #1
+; GISEL-NEXT:    mov w0, #1 // =0x1
 ; GISEL-NEXT:    str w0, [x6]
 ; GISEL-NEXT:    ret
 entry:
@@ -303,15 +315,17 @@ define i32 @and_slt_sle_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; SDISEL:       // %bb.0: // %entry
 ; SDISEL-NEXT:    cmp w2, w3
 ; SDISEL-NEXT:    ccmp w0, w1, #0, le
-; SDISEL-NEXT:    ccmp w4, w5, #0, ge
-; SDISEL-NEXT:    b.le .LBB6_2
-; SDISEL-NEXT:  // %bb.1: // %if
-; SDISEL-NEXT:    mov w0, #1
-; SDISEL-NEXT:    str w0, [x6]
-; SDISEL-NEXT:    ret
-; SDISEL-NEXT:  .LBB6_2:
+; SDISEL-NEXT:    b.lt .LBB6_3
+; SDISEL-NEXT:  // %bb.1: // %entry
+; SDISEL-NEXT:    cmp w4, w5
+; SDISEL-NEXT:    b.gt .LBB6_3
+; SDISEL-NEXT:  // %bb.2:
 ; SDISEL-NEXT:    mov w0, wzr
 ; SDISEL-NEXT:    ret
+; SDISEL-NEXT:  .LBB6_3: // %if
+; SDISEL-NEXT:    mov w0, #1 // =0x1
+; SDISEL-NEXT:    str w0, [x6]
+; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: and_slt_sle_sgt:
 ; GISEL:       // %bb.0: // %entry
@@ -322,13 +336,13 @@ define i32 @and_slt_sle_sgt(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    tbnz w8, #0, .LBB6_3
 ; GISEL-NEXT:  // %bb.1: // %entry
-; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    cmp w4, w5
+; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    b.gt .LBB6_3
 ; GISEL-NEXT:  // %bb.2: // %common.ret
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  .LBB6_3: // %if
-; GISEL-NEXT:    mov w0, #1
+; GISEL-NEXT:    mov w0, #1 // =0x1
 ; GISEL-NEXT:    str w0, [x6]
 ; GISEL-NEXT:    ret
 entry:
@@ -352,15 +366,17 @@ define i32 @and_sle_sgt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; SDISEL:       // %bb.0: // %entry
 ; SDISEL-NEXT:    cmp w2, w3
 ; SDISEL-NEXT:    ccmp w0, w1, #0, gt
-; SDISEL-NEXT:    ccmp w4, w5, #0, gt
-; SDISEL-NEXT:    b.lt .LBB7_2
-; SDISEL-NEXT:  // %bb.1: // %if
-; SDISEL-NEXT:    mov w0, #1
-; SDISEL-NEXT:    str w0, [x6]
-; SDISEL-NEXT:    ret
-; SDISEL-NEXT:  .LBB7_2:
+; SDISEL-NEXT:    b.le .LBB7_3
+; SDISEL-NEXT:  // %bb.1: // %entry
+; SDISEL-NEXT:    cmp w4, w5
+; SDISEL-NEXT:    b.ge .LBB7_3
+; SDISEL-NEXT:  // %bb.2:
 ; SDISEL-NEXT:    mov w0, wzr
 ; SDISEL-NEXT:    ret
+; SDISEL-NEXT:  .LBB7_3: // %if
+; SDISEL-NEXT:    mov w0, #1 // =0x1
+; SDISEL-NEXT:    str w0, [x6]
+; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: and_sle_sgt_sge:
 ; GISEL:       // %bb.0: // %entry
@@ -371,13 +387,13 @@ define i32 @and_sle_sgt_sge(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    tbnz w8, #0, .LBB7_3
 ; GISEL-NEXT:  // %bb.1: // %entry
-; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    cmp w4, w5
+; GISEL-NEXT:    mov w0, wzr
 ; GISEL-NEXT:    b.ge .LBB7_3
 ; GISEL-NEXT:  // %bb.2: // %common.ret
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:  .LBB7_3: // %if
-; GISEL-NEXT:    mov w0, #1
+; GISEL-NEXT:    mov w0, #1 // =0x1
 ; GISEL-NEXT:    str w0, [x6]
 ; GISEL-NEXT:    ret
 entry:

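Every SDISEL hunk in andorbrcompare.ll has the same shape: a second ccmp feeding one branch is replaced by a plain cmp and an extra branch. The AArch64 conditional-compare conversion weighs profitability against the scheduling model, so the likely (unconfirmed) reading is that the A510-based generic model no longer rates the longer ccmp chain as a win here; the GISEL output only sees an instruction swap. A sketch of the compare-combination control flow these functions share; the name and predicates are illustrative:

define i32 @or_cmp_sketch(i32 %s0, i32 %s1, i32 %s2, i32 %s3, ptr %p) {
entry:
  ; Illustrative only: two compares merged into one branch condition, which
  ; SelectionDAG can lower as cmp+ccmp+b.<cc> or as two cmp+branch pairs.
  %c0 = icmp eq i32 %s0, %s1
  %c1 = icmp ult i32 %s2, %s3
  %cc = or i1 %c0, %c1
  br i1 %cc, label %if, label %exit
if:
  store i32 1, ptr %p
  br label %exit
exit:
  %r = phi i32 [ 1, %if ], [ 0, %entry ]
  ret i32 %r
}
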
diff --git a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
index 94e6a25aa73c15..f606d8914d0eda 100644
--- a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
+++ b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
@@ -283,8 +283,8 @@ define %T_NESTED_STRUCT_DIFFM @struct_nested_different_field_types() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    movi d2, #0000000000000000
 ; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    movi d2, #0000000000000000
 ; CHECK-NEXT:    ret
   ret %T_NESTED_STRUCT_DIFFM zeroinitializer
 }
@@ -294,8 +294,8 @@ define [ 1 x %T_NESTED_STRUCT_DIFFM ] @array_of_struct_nested_different_field_ty
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    movi d2, #0000000000000000
 ; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    movi d2, #0000000000000000
 ; CHECK-NEXT:    ret
   ret [ 1 x %T_NESTED_STRUCT_DIFFM ] zeroinitializer
 }
@@ -305,12 +305,12 @@ define [ 2 x %T_NESTED_STRUCT_DIFFM ] @array_of_struct_nested_different_field_ty
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    movi d2, #0000000000000000
 ; CHECK-NEXT:    movi d3, #0000000000000000
+; CHECK-NEXT:    mov w1, wzr
 ; CHECK-NEXT:    movi d4, #0000000000000000
 ; CHECK-NEXT:    movi d5, #0000000000000000
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    mov w1, wzr
 ; CHECK-NEXT:    ret
   ret [ 2 x %T_NESTED_STRUCT_DIFFM ] zeroinitializer
 }
@@ -458,16 +458,16 @@ define void @caller_in_memory() {
 ; CHECK-NEXT:    add x8, sp, #8
 ; CHECK-NEXT:    bl return_in_memory
 ; CHECK-NEXT:    ldur q0, [sp, #24]
+; CHECK-NEXT:    ldur q1, [sp, #8]
 ; CHECK-NEXT:    adrp x8, in_memory_store
 ; CHECK-NEXT:    add x8, x8, :lo12:in_memory_store
-; CHECK-NEXT:    ldur q1, [sp, #8]
-; CHECK-NEXT:    ldur q2, [sp, #56]
-; CHECK-NEXT:    ldur q3, [sp, #40]
-; CHECK-NEXT:    ldr d4, [sp, #72]
-; CHECK-NEXT:    stp q1, q0, [x8]
+; CHECK-NEXT:    ldr d2, [sp, #72]
+; CHECK-NEXT:    ldur q3, [sp, #56]
+; CHECK-NEXT:    ldur q4, [sp, #40]
 ; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    stp q3, q2, [x8, #32]
-; CHECK-NEXT:    str d4, [x8, #64]
+; CHECK-NEXT:    stp q1, q0, [x8]
+; CHECK-NEXT:    str d2, [x8, #64]
+; CHECK-NEXT:    stp q4, q3, [x8, #32]
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %1 = call %T_IN_MEMORY @return_in_memory()
@@ -478,15 +478,16 @@ define void @caller_in_memory() {
 define void @callee_in_memory(%T_IN_MEMORY %a) {
 ; CHECK-LABEL: callee_in_memory:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [sp, #64]
+; CHECK-NEXT:    ldp q1, q2, [sp, #32]
 ; CHECK-NEXT:    adrp x8, in_memory_store
 ; CHECK-NEXT:    add x8, x8, :lo12:in_memory_store
-; CHECK-NEXT:    ldr q3, [sp, #16]
-; CHECK-NEXT:    ldp q1, q2, [sp, #32]
+; CHECK-NEXT:    ldr d0, [sp, #64]
 ; CHECK-NEXT:    str d0, [x8, #64]
-; CHECK-NEXT:    ldr q0, [sp]
-; CHECK-NEXT:    stp q1, q2, [x8, #32]
-; CHECK-NEXT:    stp q0, q3, [x8]
+; CHECK-NEXT:    ldr q0, [sp, #16]
+; CHECK-NEXT:    str q2, [x8, #48]
+; CHECK-NEXT:    ldr q2, [sp]
+; CHECK-NEXT:    stp q0, q1, [x8, #16]
+; CHECK-NEXT:    str q2, [x8]
 ; CHECK-NEXT:    ret
   store %T_IN_MEMORY %a, ptr @in_memory_store
   ret void
@@ -502,11 +503,11 @@ define void @argument_in_memory() {
 ; CHECK-NEXT:    adrp x8, in_memory_store
 ; CHECK-NEXT:    add x8, x8, :lo12:in_memory_store
 ; CHECK-NEXT:    ldp q0, q1, [x8]
-; CHECK-NEXT:    ldp q2, q3, [x8, #32]
 ; CHECK-NEXT:    ldr d4, [x8, #64]
+; CHECK-NEXT:    ldp q2, q3, [x8, #32]
+; CHECK-NEXT:    str d4, [sp, #64]
 ; CHECK-NEXT:    stp q0, q1, [sp]
 ; CHECK-NEXT:    stp q2, q3, [sp, #32]
-; CHECK-NEXT:    str d4, [sp, #64]
 ; CHECK-NEXT:    bl callee_in_memory
 ; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96

diff --git a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 0b1b581d779250..7934e39b2b69f9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -68,16 +68,16 @@ define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 ; CHECK-LABEL: add_sub_su64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d2, xzr
 ; CHECK-NEXT:    add d0, d1, d0
-; CHECK-NEXT:    sub d0, d2, d0
+; CHECK-NEXT:    fmov d1, xzr
+; CHECK-NEXT:    sub d0, d1, d0
 ; CHECK-NEXT:    ret
 ;
 ; GENERIC-LABEL: add_sub_su64:
 ; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    fmov d2, xzr
 ; GENERIC-NEXT:    add d0, d1, d0
-; GENERIC-NEXT:    sub d0, d2, d0
+; GENERIC-NEXT:    fmov d1, xzr
+; GENERIC-NEXT:    sub d0, d1, d0
 ; GENERIC-NEXT:    ret
   %vecext = extractelement <2 x i64> %a, i32 0
   %vecext1 = extractelement <2 x i64> %b, i32 0

diff --git a/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
index 3163ca0fb891b8..091fb7f0c730a7 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
@@ -10,28 +10,28 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
 ; CHECK-LABEL: fullGtU:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    adrp x10, _block@GOTPAGE
+; CHECK-NEXT:    adrp x8, _block@GOTPAGE
 ; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    ; kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    ldr x10, [x10, _block@GOTPAGEOFF]
-; CHECK-NEXT:    ldr x10, [x10]
-; CHECK-NEXT:    ldrb w11, [x10, x8]
-; CHECK-NEXT:    ldrb w12, [x10, x9]
+; CHECK-NEXT:    sxtw x9, w0
+; CHECK-NEXT:    sxtw x10, w1
+; CHECK-NEXT:    ldr x8, [x8, _block@GOTPAGEOFF]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    ldrb w11, [x8, x9]
+; CHECK-NEXT:    ldrb w12, [x8, x10]
 ; CHECK-NEXT:    cmp w11, w12
 ; CHECK-NEXT:    b.ne LBB0_3
 ; CHECK-NEXT:  ; %bb.1: ; %if.end
-; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    add x9, x9, x10
-; CHECK-NEXT:    ldrb w10, [x8, #1]
-; CHECK-NEXT:    ldrb w11, [x9, #1]
+; CHECK-NEXT:    add x9, x9, x8
+; CHECK-NEXT:    add x8, x10, x8
+; CHECK-NEXT:    ldrb w10, [x9, #1]
+; CHECK-NEXT:    ldrb w11, [x8, #1]
 ; CHECK-NEXT:    cmp w10, w11
 ; CHECK-NEXT:    b.ne LBB0_3
 ; CHECK-NEXT:  ; %bb.2: ; %if.end25
-; CHECK-NEXT:    ldrb w8, [x8, #2]
 ; CHECK-NEXT:    ldrb w9, [x9, #2]
-; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    ldrb w8, [x8, #2]
+; CHECK-NEXT:    cmp w9, w8
 ; CHECK-NEXT:    cset w8, hi
 ; CHECK-NEXT:    csel w0, wzr, w8, eq
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index cc9b47c049d562..69c558d9d5599d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -42,7 +42,7 @@ define void @t3(ptr %object) {
 define void @t4(ptr %object) {
 ; CHECK-LABEL: t4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768
+; CHECK-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-NEXT:    ldr xzr, [x0, x8]
 ; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i64, ptr %object, i64 4096
@@ -67,9 +67,9 @@ define void @t5(i64 %a) {
 define void @t6(i64 %a, ptr %object) {
 ; CHECK-LABEL: t6:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768
-; CHECK-NEXT:    add x9, x1, x0, lsl #3
-; CHECK-NEXT:    ldr xzr, [x9, x8]
+; CHECK-NEXT:    add x8, x1, x0, lsl #3
+; CHECK-NEXT:    mov w9, #32768 // =0x8000
+; CHECK-NEXT:    ldr xzr, [x8, x9]
 ; CHECK-NEXT:    ret
   %tmp1 = getelementptr inbounds i64, ptr %object, i64 %a
   %incdec.ptr = getelementptr inbounds i64, ptr %tmp1, i64 4096
@@ -81,7 +81,7 @@ define void @t6(i64 %a, ptr %object) {
 define void @t7(i64 %a) {
 ; CHECK-LABEL: t7:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #65535
+; CHECK-NEXT:    mov w8, #65535 // =0xffff
 ; CHECK-NEXT:    ldr xzr, [x0, x8]
 ; CHECK-NEXT:    ret
   %1 = add i64 %a, 65535   ;0xffff
@@ -93,7 +93,7 @@ define void @t7(i64 %a) {
 define void @t8(i64 %a) {
 ; CHECK-LABEL: t8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-4662
+; CHECK-NEXT:    mov x8, #-4662 // =0xffffffffffffedca
 ; CHECK-NEXT:    ldr xzr, [x0, x8]
 ; CHECK-NEXT:    ret
   %1 = sub i64 %a, 4662   ;-4662 is 0xffffffffffffedca
@@ -105,7 +105,7 @@ define void @t8(i64 %a) {
 define void @t9(i64 %a) {
 ; CHECK-LABEL: t9:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-305463297
+; CHECK-NEXT:    mov x8, #-305463297 // =0xffffffffedcaffff
 ; CHECK-NEXT:    ldr xzr, [x0, x8]
 ; CHECK-NEXT:    ret
   %1 = add i64 -305463297, %a   ;-305463297 is 0xffffffffedcaffff
@@ -117,7 +117,7 @@ define void @t9(i64 %a) {
 define void @t10(i64 %a) {
 ; CHECK-LABEL: t10:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #81909218222800896
+; CHECK-NEXT:    mov x8, #81909218222800896 // =0x123000000000000
 ; CHECK-NEXT:    ldr xzr, [x0, x8]
 ; CHECK-NEXT:    ret
   %1 = add i64 %a, 81909218222800896   ;0x123000000000000
@@ -129,7 +129,7 @@ define void @t10(i64 %a) {
 define void @t11(i64 %a) {
 ; CHECK-LABEL: t11:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #17767
+; CHECK-NEXT:    mov w8, #17767 // =0x4567
 ; CHECK-NEXT:    movk w8, #291, lsl #16
 ; CHECK-NEXT:    ldr xzr, [x0, x8]
 ; CHECK-NEXT:    ret

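The arm64-addrmode.ll hunks are a scheduling swap plus the new hex comments: the mov that materializes the offset and the address add trade places. The offsets themselves still cannot fold into the load, since a 64-bit ldr encodes at most a 12-bit unsigned immediate scaled by 8 (byte offset 32760), so anything larger needs a register. A minimal sketch, assuming the discarded-volatile-load idiom the tests use; the function name is illustrative:

define void @offset_out_of_range(ptr %object) {
  ; Illustrative only. A getelementptr of 4096 i64s is a 32768-byte offset,
  ; one past the largest encodable scaled immediate (4095 * 8 = 32760), so
  ; the offset is moved into a register: ldr xzr, [x0, x8].
  %p = getelementptr inbounds i64, ptr %object, i64 4096
  %v = load volatile i64, ptr %p
  ret void
}
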
diff --git a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
index caa5a7f9ead14a..6041904dc0f310 100644
--- a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -22,6 +22,7 @@ define void @foo(ptr nocapture %x, ptr nocapture %y) nounwind optsize ssp {
 ; OPT-NEXT:    [[FROMBOOL:%.*]] = trunc i32 [[BF_CLEAR_LOBIT]] to i8
 ; OPT-NEXT:    store i8 [[FROMBOOL]], ptr [[B]], align 1
 ; OPT-NEXT:    ret void
+;
   %tmp1 = load i32, ptr %x, align 4
   %b = getelementptr inbounds %struct.Y, ptr %y, i64 0, i32 1
   %bf.clear = lshr i32 %tmp1, 3
@@ -41,6 +42,7 @@ define i32 @baz(i64 %cav1.coerce) nounwind {
 ; OPT-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP]], 28
 ; OPT-NEXT:    [[BF_VAL_SEXT:%.*]] = ashr exact i32 [[TMP1]], 28
 ; OPT-NEXT:    ret i32 [[BF_VAL_SEXT]]
+;
   %tmp = trunc i64 %cav1.coerce to i32
   %tmp1 = shl i32 %tmp, 28
   %bf.val.sext = ashr exact i32 %tmp1, 28
@@ -57,6 +59,7 @@ define i32 @bar(i64 %cav1.coerce) nounwind {
 ; OPT-NEXT:    [[CAV1_SROA_0_1_INSERT:%.*]] = shl i32 [[TMP]], 22
 ; OPT-NEXT:    [[TMP1:%.*]] = ashr i32 [[CAV1_SROA_0_1_INSERT]], 26
 ; OPT-NEXT:    ret i32 [[TMP1]]
+;
   %tmp = trunc i64 %cav1.coerce to i32
   %cav1.sroa.0.1.insert = shl i32 %tmp, 22
   %tmp1 = ashr i32 %cav1.sroa.0.1.insert, 26
@@ -76,6 +79,7 @@ define void @fct1(ptr nocapture %x, ptr nocapture %y) nounwind optsize ssp {
 ; OPT-NEXT:    [[BF_CLEAR_LOBIT:%.*]] = and i64 [[BF_CLEAR]], 1
 ; OPT-NEXT:    store i64 [[BF_CLEAR_LOBIT]], ptr [[Y:%.*]], align 8
 ; OPT-NEXT:    ret void
+;
   %tmp1 = load i64, ptr %x, align 4
   %bf.clear = lshr i64 %tmp1, 3
   %bf.clear.lobit = and i64 %bf.clear, 1
@@ -92,6 +96,7 @@ define i64 @fct2(i64 %cav1.coerce) nounwind {
 ; OPT-NEXT:    [[TMP:%.*]] = shl i64 [[CAV1_COERCE:%.*]], 28
 ; OPT-NEXT:    [[BF_VAL_SEXT:%.*]] = ashr exact i64 [[TMP]], 28
 ; OPT-NEXT:    ret i64 [[BF_VAL_SEXT]]
+;
   %tmp = shl i64 %cav1.coerce, 28
   %bf.val.sext = ashr exact i64 %tmp, 28
   ret i64 %bf.val.sext
@@ -106,6 +111,7 @@ define i64 @fct3(i64 %cav1.coerce) nounwind {
 ; OPT-NEXT:    [[CAV1_SROA_0_1_INSERT:%.*]] = shl i64 [[CAV1_COERCE:%.*]], 22
 ; OPT-NEXT:    [[TMP1:%.*]] = ashr i64 [[CAV1_SROA_0_1_INSERT]], 26
 ; OPT-NEXT:    ret i64 [[TMP1]]
+;
   %cav1.sroa.0.1.insert = shl i64 %cav1.coerce, 22
   %tmp1 = ashr i64 %cav1.sroa.0.1.insert, 26
   ret i64 %tmp1
@@ -127,6 +133,7 @@ define void @fct4(ptr nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[OR:%.*]] = or i64 [[AND]], [[AND1]]
 ; OPT-NEXT:    store i64 [[OR]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
   %0 = load i64, ptr %y, align 8
   %and = and i64 %0, -16777216
@@ -153,6 +160,7 @@ define void @fct5(ptr nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[OR:%.*]] = or i32 [[AND]], [[AND1]]
 ; OPT-NEXT:    store i32 [[OR]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
   %0 = load i32, ptr %y, align 8
   %and = and i32 %0, -8
@@ -182,6 +190,7 @@ define void @fct6(ptr nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
 ; OPT-NEXT:    store i32 [[SHR1]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsr is an alias of ubfm
   %0 = load i32, ptr %y, align 8
@@ -214,6 +223,7 @@ define void @fct7(ptr nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHL:%.*]] = shl i32 [[OR]], 2
 ; OPT-NEXT:    store i32 [[SHL]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsl is an alias of ubfm
   %0 = load i32, ptr %y, align 8
@@ -247,6 +257,7 @@ define void @fct8(ptr nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHR1:%.*]] = lshr i64 [[OR]], 2
 ; OPT-NEXT:    store i64 [[SHR1]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsr is an alias of ubfm
   %0 = load i64, ptr %y, align 8
@@ -280,6 +291,7 @@ define void @fct9(ptr nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHL:%.*]] = shl i64 [[OR]], 2
 ; OPT-NEXT:    store i64 [[SHL]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsr is an alias of ubfm
   %0 = load i64, ptr %y, align 8
@@ -311,6 +323,7 @@ define void @fct10(ptr nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHL:%.*]] = shl i32 [[OR]], 2
 ; OPT-NEXT:    store i32 [[SHL]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsl is an alias of ubfm
   %0 = load i32, ptr %y, align 8
@@ -341,6 +354,7 @@ define void @fct11(ptr nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHL:%.*]] = shl i64 [[OR]], 2
 ; OPT-NEXT:    store i64 [[SHL]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsl is an alias of ubfm
   %0 = load i64, ptr %y, align 8
@@ -361,6 +375,7 @@ define zeroext i1 @fct12bis(i32 %tmp2) unnamed_addr nounwind ssp align 2 {
 ; OPT-NEXT:    [[AND_I_I:%.*]] = and i32 [[TMP2:%.*]], 2048
 ; OPT-NEXT:    [[TOBOOL_I_I:%.*]] = icmp ne i32 [[AND_I_I]], 0
 ; OPT-NEXT:    ret i1 [[TOBOOL_I_I]]
+;
   %and.i.i = and i32 %tmp2, 2048
   %tobool.i.i = icmp ne i32 %and.i.i, 0
   ret i1 %tobool.i.i
@@ -387,6 +402,7 @@ define void @fct12(ptr nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHR2:%.*]] = lshr i32 [[SHL]], 4
 ; OPT-NEXT:    store i32 [[SHR2]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsr is an alias of ubfm
   %0 = load i32, ptr %y, align 8
@@ -419,6 +435,7 @@ define void @fct12_mask(ptr nocapture %y, i32 %x) nounwind optsize inlinehint ss
 ; OPT-NEXT:    [[MASK:%.*]] = and i32 [[LSHR]], 268435455
 ; OPT-NEXT:    store i32 [[MASK]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsr is an alias of ubfm
   %0 = load i32, ptr %y, align 8
@@ -454,6 +471,7 @@ define void @fct13(ptr nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHR2:%.*]] = lshr i64 [[SHL]], 4
 ; OPT-NEXT:    store i64 [[SHR2]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsr is an alias of ubfm
   %0 = load i64, ptr %y, align 8
@@ -486,6 +504,7 @@ define void @fct13_mask(ptr nocapture %y, i64 %x) nounwind optsize inlinehint ss
 ; OPT-NEXT:    [[MASK:%.*]] = and i64 [[LSHR]], 1152921504606846975
 ; OPT-NEXT:    store i64 [[MASK]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsr is an alias of ubfm
   %0 = load i64, ptr %y, align 8
@@ -527,6 +546,7 @@ define void @fct14(ptr nocapture %y, i32 %x, i32 %x1) nounwind optsize inlinehin
 ; OPT-NEXT:    [[SHL1:%.*]] = shl i32 [[OR1]], 2
 ; OPT-NEXT:    store i32 [[SHL1]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsr is an alias of ubfm
 ; lsl is an alias of ubfm
@@ -573,6 +593,7 @@ define void @fct15(ptr nocapture %y, i64 %x, i64 %x1) nounwind optsize inlinehin
 ; OPT-NEXT:    [[SHL1:%.*]] = shl i64 [[OR1]], 2
 ; OPT-NEXT:    store i64 [[SHL1]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; lsr is an alias of ubfm
 ; lsl is an alias of ubfm
@@ -615,6 +636,7 @@ define void @fct16(ptr nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHR2:%.*]] = lshr i32 [[SHL]], 4
 ; OPT-NEXT:    store i32 [[SHR2]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; Create the constant
 ; Do the masking
@@ -651,6 +673,7 @@ define void @fct16_mask(ptr nocapture %y, i32 %x) nounwind optsize inlinehint ss
 ; OPT-NEXT:    [[MASK:%.*]] = and i32 [[LSHR]], 268435455
 ; OPT-NEXT:    store i32 [[MASK]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; Create the constant
 ; Do the masking
@@ -692,6 +715,7 @@ define void @fct17(ptr nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
 ; OPT-NEXT:    [[SHR2:%.*]] = lshr i64 [[SHL]], 4
 ; OPT-NEXT:    store i64 [[SHR2]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; Create the constant
 ; Do the masking
@@ -728,6 +752,7 @@ define void @fct17_mask(ptr nocapture %y, i64 %x) nounwind optsize inlinehint ss
 ; OPT-NEXT:    [[MASK:%.*]] = and i64 [[LSHR]], 1152921504606846975
 ; OPT-NEXT:    store i64 [[MASK]], ptr [[Y]], align 8
 ; OPT-NEXT:    ret void
+;
 entry:
 ; Create the constant
 ; Do the masking
@@ -754,6 +779,7 @@ define i64 @fct18(i32 %xor72) nounwind ssp {
 ; OPT-NEXT:    [[CONV82:%.*]] = zext i32 [[SHR81]] to i64
 ; OPT-NEXT:    [[RESULT:%.*]] = and i64 [[CONV82]], 255
 ; OPT-NEXT:    ret i64 [[RESULT]]
+;
   %shr81 = lshr i32 %xor72, 9
   %conv82 = zext i32 %shr81 to i64
   %result = and i64 %conv82, 255
@@ -836,6 +862,7 @@ define i32 @fct19(i64 %arg1) nounwind readonly ssp  {
 ; OPT:       return:
 ; OPT-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ [[CONV]], [[IF_THEN]] ], [ [[ADD]], [[IF_THEN7]] ], [ [[ADD23]], [[IF_THEN17]] ], [ 64, [[IF_END13]] ]
 ; OPT-NEXT:    ret i32 [[RETVAL_0]]
+;
 entry:
   %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
   %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
@@ -889,20 +916,20 @@ return:                                           ; preds = %if.end13, %if.then1
 define i80 @fct20(i128 %a, i128 %b) {
 ; LLC-LABEL: fct20:
 ; LLC:       // %bb.0: // %entry
-; LLC-NEXT:    mov x12, #11776
-; LLC-NEXT:    extr x9, x1, x0, #18
-; LLC-NEXT:    movk x12, #25856, lsl #16
-; LLC-NEXT:    lsr x8, x1, #18
-; LLC-NEXT:    movk x12, #11077, lsl #32
-; LLC-NEXT:    orr x10, x2, x3
-; LLC-NEXT:    mov w11, #26220
-; LLC-NEXT:    movk x12, #45, lsl #48
-; LLC-NEXT:    and x11, x8, x11
-; LLC-NEXT:    and x12, x9, x12
-; LLC-NEXT:    cmp x10, #0
-; LLC-NEXT:    csel x0, x12, x9, eq
-; LLC-NEXT:    csel x1, x11, x8, eq
-; LLC-NEXT:    ret
+; LLC-NEXT:	mov	x12, #11776                     // =0x2e00
+; LLC-NEXT:	lsr	x8, x1, #18
+; LLC-NEXT:	extr	x9, x1, x0, #18
+; LLC-NEXT:	movk	x12, #25856, lsl #16
+; LLC-NEXT:	orr	x10, x2, x3
+; LLC-NEXT:	mov	w11, #26220                     // =0x666c
+; LLC-NEXT:	movk	x12, #11077, lsl #32
+; LLC-NEXT:	and	x11, x8, x11
+; LLC-NEXT:	cmp	x10, #0
+; LLC-NEXT:	movk	x12, #45, lsl #48
+; LLC-NEXT:	csel	x1, x11, x8, eq
+; LLC-NEXT:	and	x12, x9, x12
+; LLC-NEXT:	csel	x0, x12, x9, eq
+; LLC-NEXT:	ret
 ; OPT-LABEL: @fct20(
 ; OPT-NEXT:  entry:
 ; OPT-NEXT:    [[SHR:%.*]] = lshr i128 [[A:%.*]], 18
@@ -916,6 +943,7 @@ define i80 @fct20(i128 %a, i128 %b) {
 ; OPT:       end:
 ; OPT-NEXT:    [[CONV3:%.*]] = phi i80 [ [[CONV]], [[ENTRY:%.*]] ], [ [[CONV2]], [[THEN]] ]
 ; OPT-NEXT:    ret i80 [[CONV3]]
+;
 entry:
   %shr = lshr i128 %a, 18
   %conv = trunc i128 %shr to i80
@@ -947,6 +975,7 @@ define i64 @fct21(i64 %x) {
 ; OPT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x [64 x i64]], ptr @arr, i64 0, i64 0, i64 [[AND]]
 ; OPT-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; OPT-NEXT:    ret i64 [[TMP0]]
+;
 entry:
   %shr = lshr i64 %x, 4
   %and = and i64 %shr, 15
@@ -971,6 +1000,7 @@ define i16 @test_ignored_rightbits(i32 %dst, i32 %in) {
 ; OPT-NEXT:    [[OR18:%.*]] = or i32 [[SHL16]], [[INSERTION]]
 ; OPT-NEXT:    [[CONV19:%.*]] = trunc i32 [[OR18]] to i16
 ; OPT-NEXT:    ret i16 [[CONV19]]
+;
   %positioned_field = shl i32 %in, 3
   %positioned_masked_field = and i32 %positioned_field, 120
   %masked_dst = and i32 %dst, 7
@@ -1016,6 +1046,7 @@ define void @sameOperandBFI(i64 %src, i64 %src2, ptr %ptr) {
 ; OPT-NEXT:    br label [[END]]
 ; OPT:       end:
 ; OPT-NEXT:    ret void
+;
 entry:
   %shr47 = lshr i64 %src, 47
   %src2.trunc = trunc i64 %src2 to i32

diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index e6b05f7182f8c4..9bf638f57a5120 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -6,10 +6,10 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-LABEL: fptosi_v4f64_to_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr
@@ -20,17 +20,17 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
 define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
 ; CHECK-LABEL: fptosi_v4f64_to_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
-; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
 ; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    xtn v2.2s, v2.2d
 ; CHECK-NEXT:    xtn v3.2s, v3.2d
+; CHECK-NEXT:    xtn v2.2s, v2.2d
+; CHECK-NEXT:    uzp1 v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    uzp1 v1.4h, v2.4h, v3.4h
 ; CHECK-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    ret
@@ -70,10 +70,10 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-LABEL: fptoui_v4f64_to_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr

diff --git a/llvm/test/CodeGen/AArch64/arm64-cse.ll b/llvm/test/CodeGen/AArch64/arm64-cse.ll
index 9ea51161dad0ea..7afa30970dff2f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-cse.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-cse.ll
@@ -15,8 +15,8 @@ define ptr @t1(ptr %base, ptr nocapture %offset, i32 %size) nounwind {
 ; CHECK-NEXT:    mov x0, xzr
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  LBB0_2: ; %if.end
-; CHECK-NEXT:    sub w9, w9, w8
 ; CHECK-NEXT:    add x0, x0, w8, sxtw
+; CHECK-NEXT:    sub w9, w9, w8
 ; CHECK-NEXT:    str w9, [x1]
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/arm64-csel.ll b/llvm/test/CodeGen/AArch64/arm64-csel.ll
index dd721e736f844f..35051d0c6017e9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-csel.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-csel.ll
@@ -214,8 +214,8 @@ entry:
 define i32 @foo15(i32 %a, i32 %b) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo15:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    cinc w0, w8, gt
 ; CHECK-NEXT:    ret
 entry:
@@ -227,8 +227,8 @@ entry:
 define i32 @foo16(i32 %a, i32 %b) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    cmp w0, w1
 ; CHECK-NEXT:    cinc w0, w8, le
 ; CHECK-NEXT:    ret
 entry:
@@ -240,8 +240,8 @@ entry:
 define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo17:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    cinc x0, x8, gt
 ; CHECK-NEXT:    ret
 entry:
@@ -253,8 +253,8 @@ entry:
 define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo18:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    cinc x0, x8, le
 ; CHECK-NEXT:    ret
 entry:
@@ -267,8 +267,8 @@ entry:
 define i64 @foo18_overflow1(i64 %a, i64 %b) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo18_overflow1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    mov x8, #9223372036854775807 // =0x7fffffffffffffff
+; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    csel x0, x8, xzr, gt
 ; CHECK-NEXT:    ret
 entry:
@@ -281,8 +281,8 @@ entry:
 define i64 @foo18_overflow2(i64 %a, i64 %b) nounwind readnone optsize ssp {
 ; CHECK-LABEL: foo18_overflow2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    mov x8, #9223372036854775807 // =0x7fffffffffffffff
+; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    csel x0, xzr, x8, gt
 ; CHECK-NEXT:    ret
 entry:

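The arm64-csel.ll hunks all make one swap: the mov materializing a constant now issues before the cmp, keeping the flag-setting instruction adjacent to its cinc/csel consumer, which suits an in-order little core. A sketch of the select pattern behind foo15, reconstructed from the checked output rather than the test source, so treat the exact IR as an assumption:

define i32 @select_inc_sketch(i32 %a, i32 %b) {
entry:
  ; Reconstructed, not copied: icmp plus a select of adjacent constants
  ; lowers to cmp + cinc, where w8 holds 1 and becomes 2 when 'gt' holds.
  %cmp = icmp sgt i32 %a, %b
  %r = select i1 %cmp, i32 2, i32 1
  ret i32 %r
}
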
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index a6afddfe3f73aa..2112944cc84793 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -420,9 +420,9 @@ define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) n
 ;
 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI33_0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    adrp x8, .LCPI33_0
 ; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI33_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
@@ -443,9 +443,9 @@ define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b
 ;
 ; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
 ; CHECK-GI-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI34_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
@@ -466,8 +466,8 @@ define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) n
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    ret
   %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
@@ -485,8 +485,8 @@ define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float>
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    ret
   %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>

diff --git a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
index 17d937d1f3940e..d5324e4274725b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -6,8 +6,8 @@
 define float @test1(float %x, float %y) nounwind {
 ; CHECK-LABEL: test1:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    ; kill: def $s0 killed $s0 def $q0
 ; CHECK-NEXT:    mvni.4s v2, #128, lsl #24
+; CHECK-NEXT:    ; kill: def $s0 killed $s0 def $q0
 ; CHECK-NEXT:    ; kill: def $s1 killed $s1 def $q1
 ; CHECK-NEXT:    bif.16b v0, v1, v2
 ; CHECK-NEXT:    ; kill: def $s0 killed $s0 killed $q0
@@ -37,8 +37,8 @@ define double @test3(double %a, float %b, float %c) nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.2d v3, #0xffffffffffffffff
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fadd s1, s1, s2
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fneg.2d v2, v3
 ; CHECK-NEXT:    fcvt d1, s1
 ; CHECK-NEXT:    bif.16b v0, v1, v2

diff --git a/llvm/test/CodeGen/AArch64/arm64-fmadd.ll b/llvm/test/CodeGen/AArch64/arm64-fmadd.ll
index b1dde3fefa9165..364cbefaa873a8 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fmadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fmadd.ll
@@ -222,11 +222,11 @@ entry:
 define float @negated_constant(float %x) {
 ; CHECK-LABEL: negated_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1037565952
+; CHECK-NEXT:    mov w8, #-1037565952 // =0xc2280000
 ; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    mov w8, #1109917696
-; CHECK-NEXT:    fmul s1, s0, s1
+; CHECK-NEXT:    mov w8, #1109917696 // =0x42280000
 ; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    fmul s1, s0, s1
 ; CHECK-NEXT:    fmadd s0, s0, s2, s1
 ; CHECK-NEXT:    ret
   %m = fmul float %x, 42.0

diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll
index 40f6c948683d52..3dd4fcba3412d9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll
@@ -20,12 +20,12 @@ define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) uwtable ssp minsi
 ; CHECK-NEXT:    .cfi_offset b10, -56
 ; CHECK-NEXT:    .cfi_offset b11, -64
 ; CHECK-NEXT:    fmov s3, #1.00000000
-; CHECK-NEXT:    scvtf s4, w0
 ; CHECK-NEXT:    sub w19, w0, #1
 ; CHECK-NEXT:    fadd s8, s0, s3
 ; CHECK-NEXT:    fadd s0, s8, s1
+; CHECK-NEXT:    scvtf s1, w0
 ; CHECK-NEXT:    fadd s0, s0, s2
-; CHECK-NEXT:    fsub s9, s0, s4
+; CHECK-NEXT:    fsub s9, s0, s1
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __Z3goof
 ; CHECK-NEXT:    fmov s10, s0
@@ -59,12 +59,12 @@ define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) uwtable ssp minsi
 ; CHECK-LINUX-NEXT:    .cfi_offset b10, -56
 ; CHECK-LINUX-NEXT:    .cfi_offset b11, -64
 ; CHECK-LINUX-NEXT:    fmov s3, #1.00000000
-; CHECK-LINUX-NEXT:    scvtf s4, w0
 ; CHECK-LINUX-NEXT:    sub w19, w0, #1
 ; CHECK-LINUX-NEXT:    fadd s8, s0, s3
 ; CHECK-LINUX-NEXT:    fadd s0, s8, s1
+; CHECK-LINUX-NEXT:    scvtf s1, w0
 ; CHECK-LINUX-NEXT:    fadd s0, s0, s2
-; CHECK-LINUX-NEXT:    fsub s9, s0, s4
+; CHECK-LINUX-NEXT:    fsub s9, s0, s1
 ; CHECK-LINUX-NEXT:    fmov s0, s8
 ; CHECK-LINUX-NEXT:    bl _Z3goof
 ; CHECK-LINUX-NEXT:    fmov s10, s0

diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 6a82102d24a377..5ac8ad55906432 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -593,7 +593,7 @@ define ptr @test_v16i8_post_imm_st1_lane(<16 x i8> %in, ptr %addr) {
 define ptr @test_v16i8_post_reg_st1_lane(<16 x i8> %in, ptr %addr) {
 ; CHECK-LABEL: test_v16i8_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    mov w8, #2 ; =0x2
 ; CHECK-NEXT:    st1.b { v0 }[3], [x0], x8
 ; CHECK-NEXT:    ret
   %elt = extractelement <16 x i8> %in, i32 3
@@ -619,7 +619,7 @@ define ptr @test_v8i16_post_imm_st1_lane(<8 x i16> %in, ptr %addr) {
 define ptr @test_v8i16_post_reg_st1_lane(<8 x i16> %in, ptr %addr) {
 ; CHECK-LABEL: test_v8i16_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-NEXT:    st1.h { v0 }[3], [x0], x8
 ; CHECK-NEXT:    ret
   %elt = extractelement <8 x i16> %in, i32 3
@@ -644,7 +644,7 @@ define ptr @test_v4i32_post_imm_st1_lane(<4 x i32> %in, ptr %addr) {
 define ptr @test_v4i32_post_reg_st1_lane(<4 x i32> %in, ptr %addr) {
 ; CHECK-LABEL: test_v4i32_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 ; =0x8
 ; CHECK-NEXT:    st1.s { v0 }[3], [x0], x8
 ; CHECK-NEXT:    ret
   %elt = extractelement <4 x i32> %in, i32 3
@@ -669,7 +669,7 @@ define ptr @test_v4f32_post_imm_st1_lane(<4 x float> %in, ptr %addr) {
 define ptr @test_v4f32_post_reg_st1_lane(<4 x float> %in, ptr %addr) {
 ; CHECK-LABEL: test_v4f32_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 ; =0x8
 ; CHECK-NEXT:    st1.s { v0 }[3], [x0], x8
 ; CHECK-NEXT:    ret
   %elt = extractelement <4 x float> %in, i32 3
@@ -694,7 +694,7 @@ define ptr @test_v2i64_post_imm_st1_lane(<2 x i64> %in, ptr %addr) {
 define ptr @test_v2i64_post_reg_st1_lane(<2 x i64> %in, ptr %addr) {
 ; CHECK-LABEL: test_v2i64_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-NEXT:    st1.d { v0 }[1], [x0], x8
 ; CHECK-NEXT:    ret
   %elt = extractelement <2 x i64> %in, i64 1
@@ -719,7 +719,7 @@ define ptr @test_v2f64_post_imm_st1_lane(<2 x double> %in, ptr %addr) {
 define ptr @test_v2f64_post_reg_st1_lane(<2 x double> %in, ptr %addr) {
 ; CHECK-LABEL: test_v2f64_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-NEXT:    st1.d { v0 }[1], [x0], x8
 ; CHECK-NEXT:    ret
   %elt = extractelement <2 x double> %in, i32 1
@@ -745,7 +745,7 @@ define ptr @test_v8i8_post_imm_st1_lane(<8 x i8> %in, ptr %addr) {
 define ptr @test_v8i8_post_reg_st1_lane(<8 x i8> %in, ptr %addr) {
 ; CHECK-LABEL: test_v8i8_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    mov w8, #2 ; =0x2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    st1.b { v0 }[3], [x0], x8
 ; CHECK-NEXT:    ret
@@ -772,7 +772,7 @@ define ptr @test_v4i16_post_imm_st1_lane(<4 x i16> %in, ptr %addr) {
 define ptr @test_v4i16_post_reg_st1_lane(<4 x i16> %in, ptr %addr) {
 ; CHECK-LABEL: test_v4i16_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    st1.h { v0 }[3], [x0], x8
 ; CHECK-NEXT:    ret
@@ -799,7 +799,7 @@ define ptr @test_v2i32_post_imm_st1_lane(<2 x i32> %in, ptr %addr) {
 define ptr @test_v2i32_post_reg_st1_lane(<2 x i32> %in, ptr %addr) {
 ; CHECK-LABEL: test_v2i32_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 ; =0x8
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    st1.s { v0 }[1], [x0], x8
 ; CHECK-NEXT:    ret
@@ -826,7 +826,7 @@ define ptr @test_v2f32_post_imm_st1_lane(<2 x float> %in, ptr %addr) {
 define ptr @test_v2f32_post_reg_st1_lane(<2 x float> %in, ptr %addr) {
 ; CHECK-LABEL: test_v2f32_post_reg_st1_lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 ; =0x8
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    st1.s { v0 }[1], [x0], x8
 ; CHECK-NEXT:    ret
@@ -3909,8 +3909,8 @@ define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2lane(ptr %A, ptr %ptr, <
 define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C) nounwind {
 ; CHECK-LABEL: test_v8i16_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.h { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -3941,8 +3941,8 @@ define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2lane(ptr %A, ptr %ptr, <
 define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C) nounwind {
 ; CHECK-LABEL: test_v4i16_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.h { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -3973,8 +3973,8 @@ define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2lane(ptr %A, ptr %ptr, <
 define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C) nounwind {
 ; CHECK-LABEL: test_v4i32_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.s { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -4005,8 +4005,8 @@ define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2lane(ptr %A, ptr %ptr, <
 define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C) nounwind {
 ; CHECK-LABEL: test_v2i32_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.s { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -4037,8 +4037,8 @@ define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2lane(ptr %A, ptr %ptr, <
 define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C) nounwind {
 ; CHECK-LABEL: test_v2i64_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.d { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -4069,8 +4069,8 @@ define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2lane(ptr %A, ptr %ptr, <
 define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C) nounwind {
 ; CHECK-LABEL: test_v1i64_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.d { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -4101,8 +4101,8 @@ define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2lane(ptr %A, ptr %pt
 define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <4 x float> %B, <4 x float> %C) nounwind {
 ; CHECK-LABEL: test_v4f32_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.s { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -4133,8 +4133,8 @@ define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2lane(ptr %A, ptr %pt
 define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <2 x float> %B, <2 x float> %C) nounwind {
 ; CHECK-LABEL: test_v2f32_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.s { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -4165,8 +4165,8 @@ define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2lane(ptr %A, ptr %
 define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <2 x double> %B, <2 x double> %C) nounwind {
 ; CHECK-LABEL: test_v2f64_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.d { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -4197,8 +4197,8 @@ define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2lane(ptr %A, ptr %
 define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2lane(ptr %A, ptr %ptr, i64 %inc, <1 x double> %B, <1 x double> %C) nounwind {
 ; CHECK-LABEL: test_v1f64_post_reg_ld2lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ld2.d { v0, v1 }[0], [x0], x8
 ; CHECK-NEXT:    str x0, [x1]
@@ -5456,8 +5456,8 @@ define ptr @test_v8i16_post_imm_st3(ptr %A, ptr %ptr, <8 x i16> %B, <8 x i16> %C
 define ptr @test_v8i16_post_reg_st3(ptr %A, ptr %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v8i16_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.8h { v0, v1, v2 }, [x0], x8
@@ -5486,8 +5486,8 @@ define ptr @test_v4i16_post_imm_st3(ptr %A, ptr %ptr, <4 x i16> %B, <4 x i16> %C
 define ptr @test_v4i16_post_reg_st3(ptr %A, ptr %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v4i16_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st3.4h { v0, v1, v2 }, [x0], x8
@@ -5516,8 +5516,8 @@ define ptr @test_v4i32_post_imm_st3(ptr %A, ptr %ptr, <4 x i32> %B, <4 x i32> %C
 define ptr @test_v4i32_post_reg_st3(ptr %A, ptr %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v4i32_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.4s { v0, v1, v2 }, [x0], x8
@@ -5546,8 +5546,8 @@ define ptr @test_v2i32_post_imm_st3(ptr %A, ptr %ptr, <2 x i32> %B, <2 x i32> %C
 define ptr @test_v2i32_post_reg_st3(ptr %A, ptr %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2i32_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st3.2s { v0, v1, v2 }, [x0], x8
@@ -5576,8 +5576,8 @@ define ptr @test_v2i64_post_imm_st3(ptr %A, ptr %ptr, <2 x i64> %B, <2 x i64> %C
 define ptr @test_v2i64_post_reg_st3(ptr %A, ptr %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2i64_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.2d { v0, v1, v2 }, [x0], x8
@@ -5606,8 +5606,8 @@ define ptr @test_v1i64_post_imm_st3(ptr %A, ptr %ptr, <1 x i64> %B, <1 x i64> %C
 define ptr @test_v1i64_post_reg_st3(ptr %A, ptr %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v1i64_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st1.1d { v0, v1, v2 }, [x0], x8
@@ -5636,8 +5636,8 @@ define ptr @test_v4f32_post_imm_st3(ptr %A, ptr %ptr, <4 x float> %B, <4 x float
 define ptr @test_v4f32_post_reg_st3(ptr %A, ptr %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v4f32_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.4s { v0, v1, v2 }, [x0], x8
@@ -5666,8 +5666,8 @@ define ptr @test_v2f32_post_imm_st3(ptr %A, ptr %ptr, <2 x float> %B, <2 x float
 define ptr @test_v2f32_post_reg_st3(ptr %A, ptr %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2f32_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st3.2s { v0, v1, v2 }, [x0], x8
@@ -5696,8 +5696,8 @@ define ptr @test_v2f64_post_imm_st3(ptr %A, ptr %ptr, <2 x double> %B, <2 x doub
 define ptr @test_v2f64_post_reg_st3(ptr %A, ptr %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2f64_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.2d { v0, v1, v2 }, [x0], x8
@@ -5726,8 +5726,8 @@ define ptr @test_v1f64_post_imm_st3(ptr %A, ptr %ptr, <1 x double> %B, <1 x doub
 define ptr @test_v1f64_post_reg_st3(ptr %A, ptr %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v1f64_post_reg_st3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st1.1d { v0, v1, v2 }, [x0], x8
@@ -6530,8 +6530,8 @@ define ptr @test_v8i16_post_imm_st1x3(ptr %A, ptr %ptr, <8 x i16> %B, <8 x i16>
 define ptr @test_v8i16_post_reg_st1x3(ptr %A, ptr %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v8i16_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st1.8h { v0, v1, v2 }, [x0], x8
@@ -6560,8 +6560,8 @@ define ptr @test_v4i16_post_imm_st1x3(ptr %A, ptr %ptr, <4 x i16> %B, <4 x i16>
 define ptr @test_v4i16_post_reg_st1x3(ptr %A, ptr %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v4i16_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st1.4h { v0, v1, v2 }, [x0], x8
@@ -6590,8 +6590,8 @@ define ptr @test_v4i32_post_imm_st1x3(ptr %A, ptr %ptr, <4 x i32> %B, <4 x i32>
 define ptr @test_v4i32_post_reg_st1x3(ptr %A, ptr %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v4i32_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st1.4s { v0, v1, v2 }, [x0], x8
@@ -6620,8 +6620,8 @@ define ptr @test_v2i32_post_imm_st1x3(ptr %A, ptr %ptr, <2 x i32> %B, <2 x i32>
 define ptr @test_v2i32_post_reg_st1x3(ptr %A, ptr %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2i32_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st1.2s { v0, v1, v2 }, [x0], x8
@@ -6650,8 +6650,8 @@ define ptr @test_v2i64_post_imm_st1x3(ptr %A, ptr %ptr, <2 x i64> %B, <2 x i64>
 define ptr @test_v2i64_post_reg_st1x3(ptr %A, ptr %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2i64_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st1.2d { v0, v1, v2 }, [x0], x8
@@ -6680,8 +6680,8 @@ define ptr @test_v1i64_post_imm_st1x3(ptr %A, ptr %ptr, <1 x i64> %B, <1 x i64>
 define ptr @test_v1i64_post_reg_st1x3(ptr %A, ptr %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v1i64_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st1.1d { v0, v1, v2 }, [x0], x8
@@ -6710,8 +6710,8 @@ define ptr @test_v4f32_post_imm_st1x3(ptr %A, ptr %ptr, <4 x float> %B, <4 x flo
 define ptr @test_v4f32_post_reg_st1x3(ptr %A, ptr %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v4f32_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st1.4s { v0, v1, v2 }, [x0], x8
@@ -6740,8 +6740,8 @@ define ptr @test_v2f32_post_imm_st1x3(ptr %A, ptr %ptr, <2 x float> %B, <2 x flo
 define ptr @test_v2f32_post_reg_st1x3(ptr %A, ptr %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2f32_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st1.2s { v0, v1, v2 }, [x0], x8
@@ -6770,8 +6770,8 @@ define ptr @test_v2f64_post_imm_st1x3(ptr %A, ptr %ptr, <2 x double> %B, <2 x do
 define ptr @test_v2f64_post_reg_st1x3(ptr %A, ptr %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2f64_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st1.2d { v0, v1, v2 }, [x0], x8
@@ -6800,8 +6800,8 @@ define ptr @test_v1f64_post_imm_st1x3(ptr %A, ptr %ptr, <1 x double> %B, <1 x do
 define ptr @test_v1f64_post_reg_st1x3(ptr %A, ptr %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v1f64_post_reg_st1x3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
 ; CHECK-NEXT:    st1.1d { v0, v1, v2 }, [x0], x8
@@ -7603,8 +7603,8 @@ define ptr @test_v8i16_post_imm_st3lane(ptr %A, ptr %ptr, <8 x i16> %B, <8 x i16
 define ptr @test_v8i16_post_reg_st3lane(ptr %A, ptr %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v8i16_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.h { v0, v1, v2 }[0], [x0], x8
@@ -7633,8 +7633,8 @@ define ptr @test_v4i16_post_imm_st3lane(ptr %A, ptr %ptr, <4 x i16> %B, <4 x i16
 define ptr @test_v4i16_post_reg_st3lane(ptr %A, ptr %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v4i16_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.h { v0, v1, v2 }[0], [x0], x8
@@ -7663,8 +7663,8 @@ define ptr @test_v4i32_post_imm_st3lane(ptr %A, ptr %ptr, <4 x i32> %B, <4 x i32
 define ptr @test_v4i32_post_reg_st3lane(ptr %A, ptr %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v4i32_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.s { v0, v1, v2 }[0], [x0], x8
@@ -7693,8 +7693,8 @@ define ptr @test_v2i32_post_imm_st3lane(ptr %A, ptr %ptr, <2 x i32> %B, <2 x i32
 define ptr @test_v2i32_post_reg_st3lane(ptr %A, ptr %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2i32_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.s { v0, v1, v2 }[0], [x0], x8
@@ -7723,8 +7723,8 @@ define ptr @test_v2i64_post_imm_st3lane(ptr %A, ptr %ptr, <2 x i64> %B, <2 x i64
 define ptr @test_v2i64_post_reg_st3lane(ptr %A, ptr %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2i64_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.d { v0, v1, v2 }[0], [x0], x8
@@ -7753,8 +7753,8 @@ define ptr @test_v1i64_post_imm_st3lane(ptr %A, ptr %ptr, <1 x i64> %B, <1 x i64
 define ptr @test_v1i64_post_reg_st3lane(ptr %A, ptr %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v1i64_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.d { v0, v1, v2 }[0], [x0], x8
@@ -7783,8 +7783,8 @@ define ptr @test_v4f32_post_imm_st3lane(ptr %A, ptr %ptr, <4 x float> %B, <4 x f
 define ptr @test_v4f32_post_reg_st3lane(ptr %A, ptr %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v4f32_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.s { v0, v1, v2 }[0], [x0], x8
@@ -7813,8 +7813,8 @@ define ptr @test_v2f32_post_imm_st3lane(ptr %A, ptr %ptr, <2 x float> %B, <2 x f
 define ptr @test_v2f32_post_reg_st3lane(ptr %A, ptr %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2f32_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.s { v0, v1, v2 }[0], [x0], x8
@@ -7843,8 +7843,8 @@ define ptr @test_v2f64_post_imm_st3lane(ptr %A, ptr %ptr, <2 x double> %B, <2 x
 define ptr @test_v2f64_post_reg_st3lane(ptr %A, ptr %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v2f64_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.d { v0, v1, v2 }[0], [x0], x8
@@ -7873,8 +7873,8 @@ define ptr @test_v1f64_post_imm_st3lane(ptr %A, ptr %ptr, <1 x double> %B, <1 x
 define ptr @test_v1f64_post_reg_st3lane(ptr %A, ptr %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
 ; CHECK-LABEL: test_v1f64_post_reg_st3lane:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    lsl x8, x2, #3
 ; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    st3.d { v0, v1, v2 }[0], [x0], x8
@@ -8910,10 +8910,10 @@ define <4 x float> @test_v4f32_post_reg_ld1lane_dep_vec_on_load(ptr %bar, ptr %p
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
 ; CHECK-NEXT:    str q0, [x3]
-; CHECK-NEXT:    ldr q0, [x4]
 ; CHECK-NEXT:    add x8, x0, x2, lsl #2
-; CHECK-NEXT:    mov.s v0[1], v1[0]
+; CHECK-NEXT:    ldr q0, [x4]
 ; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov.s v0[1], v1[0]
 ; CHECK-NEXT:    ret
   %tmp1 = load float, ptr %bar
   store <4 x float> %vec, ptr %dep_ptr_1, align 16
@@ -9071,10 +9071,10 @@ define <4 x i32> @test_inc_cycle(<4 x i32> %vec, ptr %in) {
 ; CHECK-LABEL: test_inc_cycle:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ld1.s { v0 }[0], [x0]
-; CHECK-NEXT:    adrp x8, _var@PAGE
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    add x9, x0, x9, lsl #2
-; CHECK-NEXT:    str x9, [x8, _var@PAGEOFF]
+; CHECK-NEXT:    adrp x9, _var@PAGE
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    add x8, x0, x8, lsl #2
+; CHECK-NEXT:    str x8, [x9, _var@PAGEOFF]
 ; CHECK-NEXT:    ret
   %elt = load i32, ptr %in
   %newvec = insertelement <4 x i32> %vec, i32 %elt, i32 0
@@ -9143,7 +9143,7 @@ define i32 @load_single_extract_variable_index_v3i32_small_align(ptr %A, i32 %id
 ; CHECK-LABEL: load_single_extract_variable_index_v3i32_small_align:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    mov w9, w1
-; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    mov w8, #2 ; =0x2
 ; CHECK-NEXT:    cmp x9, #2
 ; CHECK-NEXT:    csel x8, x9, x8, lo
 ; CHECK-NEXT:    ldr w0, [x0, x8, lsl #2]
@@ -9157,7 +9157,7 @@ define i32 @load_single_extract_variable_index_v3i32_default_align(ptr %A, i32 %
 ; CHECK-LABEL: load_single_extract_variable_index_v3i32_default_align:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    mov w9, w1
-; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    mov w8, #2 ; =0x2
 ; CHECK-NEXT:    cmp x9, #2
 ; CHECK-NEXT:    csel x8, x9, x8, lo
 ; CHECK-NEXT:    ldr w0, [x0, x8, lsl #2]

diff  --git a/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll
index 11faa0051d9e40..8947563e31f855 100644
--- a/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -367,13 +367,13 @@ entry:
 define void @test_zero_reg(ptr %addr) {
 ; CHECK-LABEL: test_zero_reg:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    ; InlineAsm Start
 ; CHECK-NEXT:    USE(xzr)
 ; CHECK-NEXT:    ; InlineAsm End
 ; CHECK-NEXT:    ; InlineAsm Start
 ; CHECK-NEXT:    USE(wzr)
 ; CHECK-NEXT:    ; InlineAsm End
+; CHECK-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-NEXT:    ; InlineAsm Start
 ; CHECK-NEXT:    USE(w8)
 ; CHECK-NEXT:    ; InlineAsm End
@@ -487,8 +487,8 @@ define void @test_vector_too_large_r_m(ptr nocapture readonly %0) {
 ; CHECK-NEXT:    ldp q2, q1, [x0]
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ldr s0, [x0, #32]
-; CHECK-NEXT:    stp q2, q1, [sp]
 ; CHECK-NEXT:    str s0, [sp, #32]
+; CHECK-NEXT:    stp q2, q1, [sp]
 ; CHECK-NEXT:    ; InlineAsm Start
 ; CHECK-NEXT:    ; InlineAsm End
 ; CHECK-NEXT:    add sp, sp, #64

diff  --git a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll
index dc4d12b2b21ca0..2816f91df44ddc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll
@@ -36,9 +36,9 @@ define i32 @foo(ptr %ptr, i32 %x, i64 %y) !dbg !3 {
 ; CHECK-NEXT:    b.eq LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %else
 ; CHECK-NEXT:    mul w9, w0, w1
-; CHECK-NEXT:    mov w10, #10
 ; CHECK-NEXT:    mul w0, w9, w1
-; CHECK-NEXT:    str w10, [x8]
+; CHECK-NEXT:    mov w9, #10 ; =0xa
+; CHECK-NEXT:    str w9, [x8]
 ; CHECK-NEXT:  LBB0_2: ; %common.ret
 ; CHECK-NEXT:    ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
index 292617156597d4..161424eaf11e38 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
@@ -913,8 +913,8 @@ define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) {
 define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture %diff) nounwind ssp {
 entry:
 ; CHECK: ld1r_2s_from_dup
-; CHECK: ld1r.2s { [[ARG2:v[0-9]+]] }, [x1]
-; CHECK-NEXT: ld1r.2s { [[ARG1:v[0-9]+]] }, [x0]
+; CHECK: ld1r.2s { [[ARG1:v[0-9]+]] }, [x0]
+; CHECK-NEXT: ld1r.2s { [[ARG2:v[0-9]+]] }, [x1]
 ; CHECK-NEXT: usubl.8h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
 ; CHECK-NEXT: str d[[RESREGNUM]], [x2]
 ; CHECK-NEXT: ret

diff  --git a/llvm/test/CodeGen/AArch64/arm64-ldp.ll b/llvm/test/CodeGen/AArch64/arm64-ldp.ll
index 9d5c68f6fea315..03b7a8c8ba4f48 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp.ll
@@ -422,10 +422,8 @@ define i64 @ldp_sext_int_post(i32* %p) nounwind {
 ; CHECK-LABEL: ldp_sext_int_post:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT:    add x8, x0, #8
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    ldpsw x19, x20, [x0]
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ldpsw x19, x20, [x0], #8
 ; CHECK-NEXT:    bl "use-ptr"
 ; CHECK-NEXT:    add x0, x20, x19
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload

diff  --git a/llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll b/llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
index ab314702463d23..1cc0771d729d6b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
@@ -10,7 +10,7 @@ define void @t0(ptr %out, ptr %in) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov w2, #16
+; CHECK-NEXT:    mov w2, #16 ; =0x10
 ; CHECK-NEXT:    bl _memcpy
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -39,12 +39,12 @@ define void @t2(ptr %out, ptr %in) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    ldrb w8, [x1, #3]
 ; CHECK-NEXT:    ldrb w9, [x1, #2]
-; CHECK-NEXT:    ldrb w10, [x1, #1]
-; CHECK-NEXT:    ldrb w11, [x1]
+; CHECK-NEXT:    ldrb w10, [x1]
+; CHECK-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-NEXT:    strb w8, [x0, #3]
 ; CHECK-NEXT:    strb w9, [x0, #2]
-; CHECK-NEXT:    strb w10, [x0, #1]
-; CHECK-NEXT:    strb w11, [x0]
+; CHECK-NEXT:    strb w11, [x0, #1]
+; CHECK-NEXT:    strb w10, [x0]
 ; CHECK-NEXT:    ret
 entry:
   call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %in, i64 4, i1 false)

diff  --git a/llvm/test/CodeGen/AArch64/arm64-mul.ll b/llvm/test/CodeGen/AArch64/arm64-mul.ll
index 16a7cf8d10bec4..da978b49fff749 100644
--- a/llvm/test/CodeGen/AArch64/arm64-mul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-mul.ll
@@ -111,7 +111,7 @@ entry:
 define i64 @t9(i32 %a) nounwind {
 ; CHECK-LABEL: t9:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #8896
+; CHECK-NEXT:    mov w8, #8896 // =0x22c0
 ; CHECK-NEXT:    movk w8, #2, lsl #16
 ; CHECK-NEXT:    umull x0, w0, w8
 ; CHECK-NEXT:    ret
@@ -125,11 +125,11 @@ entry:
 define i64 @t10(i32 %a) nounwind {
 ; CHECK-LABEL: t10:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #2
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    sxtw x9, w0
-; CHECK-NEXT:    movk w8, #32768, lsl #16
-; CHECK-NEXT:    mul x0, x9, x8
+; CHECK-NEXT:    sxtw x8, w0
+; CHECK-NEXT:    mov w9, #2 // =0x2
+; CHECK-NEXT:    movk w9, #32768, lsl #16
+; CHECK-NEXT:    mul x0, x8, x9
 ; CHECK-NEXT:    ret
 entry:
   %tmp1 = sext i32 %a to i64
@@ -141,7 +141,7 @@ entry:
 define i64 @t11(i64 %a) nounwind {
 ; CHECK-LABEL: t11:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #29594
+; CHECK-NEXT:    mov w8, #29594 // =0x739a
 ; CHECK-NEXT:    movk w8, #65499, lsl #16
 ; CHECK-NEXT:    smnegl x0, w0, w8
 ; CHECK-NEXT:    ret
@@ -156,7 +156,7 @@ entry:
 define i64 @t12(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: t12:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #35118
+; CHECK-NEXT:    mov w8, #35118 // =0x892e
 ; CHECK-NEXT:    movk w8, #65008, lsl #16
 ; CHECK-NEXT:    smaddl x0, w0, w8, x1
 ; CHECK-NEXT:    ret
@@ -171,7 +171,7 @@ entry:
 define i64 @t13(i32 %a, i64 %b) nounwind {
 ; CHECK-LABEL: t13:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #24910
+; CHECK-NEXT:    mov w8, #24910 // =0x614e
 ; CHECK-NEXT:    movk w8, #188, lsl #16
 ; CHECK-NEXT:    umsubl x0, w0, w8, x1
 ; CHECK-NEXT:    ret
@@ -185,7 +185,7 @@ entry:
 define i64 @t14(i32 %a, i64 %b) nounwind {
 ; CHECK-LABEL: t14:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #40626
+; CHECK-NEXT:    mov w8, #40626 // =0x9eb2
 ; CHECK-NEXT:    movk w8, #65347, lsl #16
 ; CHECK-NEXT:    smsubl x0, w0, w8, x1
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index fdbd6f815354fe..79645e32074c89 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -2537,18 +2537,18 @@ entry:
 define <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coerce) {
 ; CHECK-LABEL: cmplx_mul_combined_re_im:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    adrp x9, .LCPI196_0
-; CHECK-NEXT:    fmov d4, x0
-; CHECK-NEXT:    rev32 v5.8h, v0.8h
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    ldr q3, [x9, :lo12:.LCPI196_0]
+; CHECK-NEXT:    lsr x9, x0, #16
+; CHECK-NEXT:    adrp x8, .LCPI196_0
+; CHECK-NEXT:    fmov d5, x0
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI196_0]
+; CHECK-NEXT:    rev32 v4.8h, v0.8h
+; CHECK-NEXT:    dup v1.8h, w9
 ; CHECK-NEXT:    sqneg v2.8h, v1.8h
 ; CHECK-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v3.16b
-; CHECK-NEXT:    sqdmull v2.4s, v0.4h, v4.h[0]
-; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v4.h[0]
-; CHECK-NEXT:    sqdmlal v2.4s, v5.4h, v1.4h
-; CHECK-NEXT:    sqdmlal2 v0.4s, v5.8h, v1.8h
+; CHECK-NEXT:    sqdmull v2.4s, v0.4h, v5.h[0]
+; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v5.h[0]
+; CHECK-NEXT:    sqdmlal v2.4s, v4.4h, v1.4h
+; CHECK-NEXT:    sqdmlal2 v0.4s, v4.8h, v1.8h
 ; CHECK-NEXT:    uzp2 v0.8h, v2.8h, v0.8h
 ; CHECK-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
index f77df93783b8cc..7039cccdf9393c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
@@ -73,13 +73,13 @@ define <1 x i64> @mul1xi64(<1 x i64> %A, <1 x i64> %B) {
 define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) {
 ; CHECK-LABEL: mul2xi64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    fmov x11, d0
 ; CHECK-NEXT:    mov x8, v1.d[1]
-; CHECK-NEXT:    mov x11, v0.d[1]
-; CHECK-NEXT:    mul x9, x10, x9
-; CHECK-NEXT:    mul x8, x11, x8
-; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    mul x10, x11, x10
+; CHECK-NEXT:    mul x8, x9, x8
+; CHECK-NEXT:    fmov d0, x10
 ; CHECK-NEXT:    mov v0.d[1], x8
 ; CHECK-NEXT:    ret
   %tmp3 = mul <2 x i64> %A, %B;
@@ -164,6 +164,7 @@ define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) {
 ; CHECK-NEXT:    smov w11, v0.b[2]
 ; CHECK-NEXT:    smov w12, v0.b[3]
 ; CHECK-NEXT:    smov w13, v0.b[4]
+; CHECK-NEXT:    smov w14, v0.b[5]
 ; CHECK-NEXT:    sdiv w8, w9, w8
 ; CHECK-NEXT:    smov w9, v1.b[0]
 ; CHECK-NEXT:    sdiv w9, w10, w9
@@ -171,18 +172,17 @@ define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) {
 ; CHECK-NEXT:    sdiv w10, w11, w10
 ; CHECK-NEXT:    smov w11, v1.b[3]
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    smov w9, v1.b[5]
+; CHECK-NEXT:    smov w9, v1.b[6]
 ; CHECK-NEXT:    mov v2.b[1], w8
 ; CHECK-NEXT:    sdiv w11, w12, w11
 ; CHECK-NEXT:    smov w12, v1.b[4]
 ; CHECK-NEXT:    mov v2.b[2], w10
 ; CHECK-NEXT:    smov w10, v0.b[6]
 ; CHECK-NEXT:    sdiv w12, w13, w12
-; CHECK-NEXT:    smov w13, v0.b[5]
+; CHECK-NEXT:    smov w13, v1.b[5]
 ; CHECK-NEXT:    mov v2.b[3], w11
 ; CHECK-NEXT:    smov w11, v0.b[7]
-; CHECK-NEXT:    sdiv w8, w13, w9
-; CHECK-NEXT:    smov w9, v1.b[6]
+; CHECK-NEXT:    sdiv w8, w14, w13
 ; CHECK-NEXT:    mov v2.b[4], w12
 ; CHECK-NEXT:    sdiv w9, w10, w9
 ; CHECK-NEXT:    smov w10, v1.b[7]
@@ -207,16 +207,17 @@ define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-NEXT:    smov w13, v0.b[4]
 ; CHECK-NEXT:    smov w14, v0.b[5]
 ; CHECK-NEXT:    smov w15, v0.b[6]
-; CHECK-NEXT:    sdiv w8, w9, w8
-; CHECK-NEXT:    smov w9, v1.b[0]
 ; CHECK-NEXT:    smov w16, v0.b[7]
 ; CHECK-NEXT:    smov w17, v0.b[8]
+; CHECK-NEXT:    smov w18, v0.b[9]
+; CHECK-NEXT:    sdiv w8, w9, w8
+; CHECK-NEXT:    smov w9, v1.b[0]
 ; CHECK-NEXT:    sdiv w9, w10, w9
 ; CHECK-NEXT:    smov w10, v1.b[2]
 ; CHECK-NEXT:    sdiv w10, w11, w10
 ; CHECK-NEXT:    smov w11, v1.b[3]
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    smov w9, v1.b[9]
+; CHECK-NEXT:    smov w9, v1.b[10]
 ; CHECK-NEXT:    mov v2.b[1], w8
 ; CHECK-NEXT:    sdiv w11, w12, w11
 ; CHECK-NEXT:    smov w12, v1.b[4]
@@ -238,10 +239,9 @@ define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-NEXT:    smov w16, v1.b[8]
 ; CHECK-NEXT:    mov v2.b[6], w14
 ; CHECK-NEXT:    sdiv w16, w17, w16
-; CHECK-NEXT:    smov w17, v0.b[9]
+; CHECK-NEXT:    smov w17, v1.b[9]
 ; CHECK-NEXT:    mov v2.b[7], w15
-; CHECK-NEXT:    sdiv w8, w17, w9
-; CHECK-NEXT:    smov w9, v1.b[10]
+; CHECK-NEXT:    sdiv w8, w18, w17
 ; CHECK-NEXT:    mov v2.b[8], w16
 ; CHECK-NEXT:    sdiv w9, w10, w9
 ; CHECK-NEXT:    smov w10, v1.b[11]
@@ -319,6 +319,7 @@ define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) {
 ; CHECK-NEXT:    smov w11, v0.h[2]
 ; CHECK-NEXT:    smov w12, v0.h[3]
 ; CHECK-NEXT:    smov w13, v0.h[4]
+; CHECK-NEXT:    smov w14, v0.h[5]
 ; CHECK-NEXT:    sdiv w8, w9, w8
 ; CHECK-NEXT:    smov w9, v1.h[0]
 ; CHECK-NEXT:    sdiv w9, w10, w9
@@ -326,18 +327,17 @@ define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) {
 ; CHECK-NEXT:    sdiv w10, w11, w10
 ; CHECK-NEXT:    smov w11, v1.h[3]
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    smov w9, v1.h[5]
+; CHECK-NEXT:    smov w9, v1.h[6]
 ; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    sdiv w11, w12, w11
 ; CHECK-NEXT:    smov w12, v1.h[4]
 ; CHECK-NEXT:    mov v2.h[2], w10
 ; CHECK-NEXT:    smov w10, v0.h[6]
 ; CHECK-NEXT:    sdiv w12, w13, w12
-; CHECK-NEXT:    smov w13, v0.h[5]
+; CHECK-NEXT:    smov w13, v1.h[5]
 ; CHECK-NEXT:    mov v2.h[3], w11
 ; CHECK-NEXT:    smov w11, v0.h[7]
-; CHECK-NEXT:    sdiv w8, w13, w9
-; CHECK-NEXT:    smov w9, v1.h[6]
+; CHECK-NEXT:    sdiv w8, w14, w13
 ; CHECK-NEXT:    mov v2.h[4], w12
 ; CHECK-NEXT:    sdiv w9, w10, w9
 ; CHECK-NEXT:    smov w10, v1.h[7]
@@ -463,6 +463,7 @@ define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) {
 ; CHECK-NEXT:    umov w11, v0.b[2]
 ; CHECK-NEXT:    umov w12, v0.b[3]
 ; CHECK-NEXT:    umov w13, v0.b[4]
+; CHECK-NEXT:    umov w14, v0.b[5]
 ; CHECK-NEXT:    udiv w8, w9, w8
 ; CHECK-NEXT:    umov w9, v1.b[0]
 ; CHECK-NEXT:    udiv w9, w10, w9
@@ -470,18 +471,17 @@ define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) {
 ; CHECK-NEXT:    udiv w10, w11, w10
 ; CHECK-NEXT:    umov w11, v1.b[3]
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    umov w9, v1.b[5]
+; CHECK-NEXT:    umov w9, v1.b[6]
 ; CHECK-NEXT:    mov v2.b[1], w8
 ; CHECK-NEXT:    udiv w11, w12, w11
 ; CHECK-NEXT:    umov w12, v1.b[4]
 ; CHECK-NEXT:    mov v2.b[2], w10
 ; CHECK-NEXT:    umov w10, v0.b[6]
 ; CHECK-NEXT:    udiv w12, w13, w12
-; CHECK-NEXT:    umov w13, v0.b[5]
+; CHECK-NEXT:    umov w13, v1.b[5]
 ; CHECK-NEXT:    mov v2.b[3], w11
 ; CHECK-NEXT:    umov w11, v0.b[7]
-; CHECK-NEXT:    udiv w8, w13, w9
-; CHECK-NEXT:    umov w9, v1.b[6]
+; CHECK-NEXT:    udiv w8, w14, w13
 ; CHECK-NEXT:    mov v2.b[4], w12
 ; CHECK-NEXT:    udiv w9, w10, w9
 ; CHECK-NEXT:    umov w10, v1.b[7]
@@ -506,16 +506,17 @@ define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-NEXT:    umov w13, v0.b[4]
 ; CHECK-NEXT:    umov w14, v0.b[5]
 ; CHECK-NEXT:    umov w15, v0.b[6]
-; CHECK-NEXT:    udiv w8, w9, w8
-; CHECK-NEXT:    umov w9, v1.b[0]
 ; CHECK-NEXT:    umov w16, v0.b[7]
 ; CHECK-NEXT:    umov w17, v0.b[8]
+; CHECK-NEXT:    umov w18, v0.b[9]
+; CHECK-NEXT:    udiv w8, w9, w8
+; CHECK-NEXT:    umov w9, v1.b[0]
 ; CHECK-NEXT:    udiv w9, w10, w9
 ; CHECK-NEXT:    umov w10, v1.b[2]
 ; CHECK-NEXT:    udiv w10, w11, w10
 ; CHECK-NEXT:    umov w11, v1.b[3]
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    umov w9, v1.b[9]
+; CHECK-NEXT:    umov w9, v1.b[10]
 ; CHECK-NEXT:    mov v2.b[1], w8
 ; CHECK-NEXT:    udiv w11, w12, w11
 ; CHECK-NEXT:    umov w12, v1.b[4]
@@ -537,10 +538,9 @@ define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-NEXT:    umov w16, v1.b[8]
 ; CHECK-NEXT:    mov v2.b[6], w14
 ; CHECK-NEXT:    udiv w16, w17, w16
-; CHECK-NEXT:    umov w17, v0.b[9]
+; CHECK-NEXT:    umov w17, v1.b[9]
 ; CHECK-NEXT:    mov v2.b[7], w15
-; CHECK-NEXT:    udiv w8, w17, w9
-; CHECK-NEXT:    umov w9, v1.b[10]
+; CHECK-NEXT:    udiv w8, w18, w17
 ; CHECK-NEXT:    mov v2.b[8], w16
 ; CHECK-NEXT:    udiv w9, w10, w9
 ; CHECK-NEXT:    umov w10, v1.b[11]
@@ -618,6 +618,7 @@ define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) {
 ; CHECK-NEXT:    umov w11, v0.h[2]
 ; CHECK-NEXT:    umov w12, v0.h[3]
 ; CHECK-NEXT:    umov w13, v0.h[4]
+; CHECK-NEXT:    umov w14, v0.h[5]
 ; CHECK-NEXT:    udiv w8, w9, w8
 ; CHECK-NEXT:    umov w9, v1.h[0]
 ; CHECK-NEXT:    udiv w9, w10, w9
@@ -625,18 +626,17 @@ define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) {
 ; CHECK-NEXT:    udiv w10, w11, w10
 ; CHECK-NEXT:    umov w11, v1.h[3]
 ; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    umov w9, v1.h[5]
+; CHECK-NEXT:    umov w9, v1.h[6]
 ; CHECK-NEXT:    mov v2.h[1], w8
 ; CHECK-NEXT:    udiv w11, w12, w11
 ; CHECK-NEXT:    umov w12, v1.h[4]
 ; CHECK-NEXT:    mov v2.h[2], w10
 ; CHECK-NEXT:    umov w10, v0.h[6]
 ; CHECK-NEXT:    udiv w12, w13, w12
-; CHECK-NEXT:    umov w13, v0.h[5]
+; CHECK-NEXT:    umov w13, v1.h[5]
 ; CHECK-NEXT:    mov v2.h[3], w11
 ; CHECK-NEXT:    umov w11, v0.h[7]
-; CHECK-NEXT:    udiv w8, w13, w9
-; CHECK-NEXT:    umov w9, v1.h[6]
+; CHECK-NEXT:    udiv w8, w14, w13
 ; CHECK-NEXT:    mov v2.h[4], w12
 ; CHECK-NEXT:    udiv w9, w10, w9
 ; CHECK-NEXT:    umov w10, v1.h[7]
@@ -765,37 +765,37 @@ define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) {
 ; CHECK-NEXT:    smov w15, v0.b[2]
 ; CHECK-NEXT:    smov w17, v1.b[3]
 ; CHECK-NEXT:    smov w18, v0.b[3]
-; CHECK-NEXT:    sdiv w13, w12, w11
 ; CHECK-NEXT:    smov w1, v1.b[4]
 ; CHECK-NEXT:    smov w2, v0.b[4]
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    smov w12, v1.b[5]
+; CHECK-NEXT:    smov w4, v1.b[5]
+; CHECK-NEXT:    smov w5, v0.b[5]
+; CHECK-NEXT:    sdiv w13, w12, w11
 ; CHECK-NEXT:    sdiv w10, w9, w8
-; CHECK-NEXT:    smov w13, v0.b[5]
+; CHECK-NEXT:    msub w11, w13, w11, w12
+; CHECK-NEXT:    smov w13, v1.b[7]
 ; CHECK-NEXT:    fmov s2, w11
 ; CHECK-NEXT:    smov w11, v0.b[6]
+; CHECK-NEXT:    sdiv w16, w15, w14
 ; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    smov w10, v1.b[6]
-; CHECK-NEXT:    sdiv w16, w15, w14
 ; CHECK-NEXT:    mov v2.b[1], w8
-; CHECK-NEXT:    msub w8, w16, w14, w15
-; CHECK-NEXT:    smov w15, v1.b[7]
 ; CHECK-NEXT:    sdiv w0, w18, w17
-; CHECK-NEXT:    smov w16, v0.b[7]
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    smov w14, v0.b[7]
 ; CHECK-NEXT:    mov v2.b[2], w8
-; CHECK-NEXT:    msub w14, w0, w17, w18
 ; CHECK-NEXT:    sdiv w3, w2, w1
-; CHECK-NEXT:    mov v2.b[3], w14
-; CHECK-NEXT:    msub w14, w3, w1, w2
-; CHECK-NEXT:    sdiv w9, w13, w12
-; CHECK-NEXT:    mov v2.b[4], w14
-; CHECK-NEXT:    msub w9, w9, w12, w13
-; CHECK-NEXT:    sdiv w8, w11, w10
-; CHECK-NEXT:    mov v2.b[5], w9
-; CHECK-NEXT:    msub w8, w8, w10, w11
-; CHECK-NEXT:    sdiv w12, w16, w15
+; CHECK-NEXT:    msub w8, w0, w17, w18
+; CHECK-NEXT:    mov v2.b[3], w8
+; CHECK-NEXT:    sdiv w9, w5, w4
+; CHECK-NEXT:    msub w8, w3, w1, w2
+; CHECK-NEXT:    mov v2.b[4], w8
+; CHECK-NEXT:    sdiv w12, w11, w10
+; CHECK-NEXT:    msub w8, w9, w4, w5
+; CHECK-NEXT:    mov v2.b[5], w8
+; CHECK-NEXT:    sdiv w9, w14, w13
+; CHECK-NEXT:    msub w8, w12, w10, w11
 ; CHECK-NEXT:    mov v2.b[6], w8
-; CHECK-NEXT:    msub w8, w12, w15, w16
+; CHECK-NEXT:    msub w8, w9, w13, w14
 ; CHECK-NEXT:    mov v2.b[7], w8
 ; CHECK-NEXT:    fmov d0, d2
 ; CHECK-NEXT:    ret
@@ -806,11 +806,12 @@ define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) {
 define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-LABEL: srem16x8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w21, -24
@@ -819,6 +820,8 @@ define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-NEXT:    .cfi_offset w24, -48
 ; CHECK-NEXT:    .cfi_offset w25, -56
 ; CHECK-NEXT:    .cfi_offset w26, -64
+; CHECK-NEXT:    .cfi_offset w27, -72
+; CHECK-NEXT:    .cfi_offset w28, -80
 ; CHECK-NEXT:    smov w11, v1.b[0]
 ; CHECK-NEXT:    smov w12, v0.b[0]
 ; CHECK-NEXT:    smov w8, v1.b[1]
@@ -827,83 +830,84 @@ define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-NEXT:    smov w15, v0.b[2]
 ; CHECK-NEXT:    smov w17, v1.b[3]
 ; CHECK-NEXT:    smov w18, v0.b[3]
-; CHECK-NEXT:    sdiv w13, w12, w11
 ; CHECK-NEXT:    smov w1, v1.b[4]
 ; CHECK-NEXT:    smov w2, v0.b[4]
 ; CHECK-NEXT:    smov w4, v1.b[5]
 ; CHECK-NEXT:    smov w5, v0.b[5]
+; CHECK-NEXT:    sdiv w13, w12, w11
 ; CHECK-NEXT:    smov w7, v1.b[6]
 ; CHECK-NEXT:    smov w19, v0.b[6]
 ; CHECK-NEXT:    smov w21, v1.b[7]
 ; CHECK-NEXT:    smov w22, v0.b[7]
 ; CHECK-NEXT:    smov w24, v1.b[8]
 ; CHECK-NEXT:    smov w25, v0.b[8]
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    smov w12, v1.b[9]
+; CHECK-NEXT:    smov w27, v1.b[9]
+; CHECK-NEXT:    smov w28, v0.b[9]
 ; CHECK-NEXT:    sdiv w10, w9, w8
-; CHECK-NEXT:    smov w13, v0.b[9]
+; CHECK-NEXT:    msub w11, w13, w11, w12
+; CHECK-NEXT:    smov w13, v1.b[11]
 ; CHECK-NEXT:    fmov s2, w11
 ; CHECK-NEXT:    smov w11, v0.b[10]
+; CHECK-NEXT:    sdiv w16, w15, w14
 ; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    smov w10, v1.b[10]
-; CHECK-NEXT:    sdiv w16, w15, w14
 ; CHECK-NEXT:    mov v2.b[1], w8
-; CHECK-NEXT:    msub w8, w16, w14, w15
-; CHECK-NEXT:    smov w15, v1.b[11]
 ; CHECK-NEXT:    sdiv w0, w18, w17
-; CHECK-NEXT:    smov w16, v0.b[11]
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    smov w14, v0.b[11]
+; CHECK-NEXT:    smov w16, v1.b[12]
 ; CHECK-NEXT:    mov v2.b[2], w8
-; CHECK-NEXT:    msub w14, w0, w17, w18
-; CHECK-NEXT:    smov w18, v1.b[12]
 ; CHECK-NEXT:    sdiv w3, w2, w1
-; CHECK-NEXT:    smov w0, v0.b[12]
-; CHECK-NEXT:    mov v2.b[3], w14
-; CHECK-NEXT:    msub w14, w3, w1, w2
-; CHECK-NEXT:    smov w2, v1.b[13]
+; CHECK-NEXT:    msub w8, w0, w17, w18
+; CHECK-NEXT:    smov w17, v0.b[12]
+; CHECK-NEXT:    smov w0, v1.b[13]
+; CHECK-NEXT:    mov v2.b[3], w8
 ; CHECK-NEXT:    sdiv w6, w5, w4
-; CHECK-NEXT:    smov w3, v0.b[13]
-; CHECK-NEXT:    mov v2.b[4], w14
-; CHECK-NEXT:    msub w17, w6, w4, w5
+; CHECK-NEXT:    msub w8, w3, w1, w2
+; CHECK-NEXT:    smov w1, v0.b[13]
+; CHECK-NEXT:    mov v2.b[4], w8
 ; CHECK-NEXT:    sdiv w20, w19, w7
-; CHECK-NEXT:    mov v2.b[5], w17
-; CHECK-NEXT:    msub w17, w20, w7, w19
+; CHECK-NEXT:    msub w8, w6, w4, w5
+; CHECK-NEXT:    mov v2.b[5], w8
 ; CHECK-NEXT:    sdiv w23, w22, w21
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v2.b[6], w17
-; CHECK-NEXT:    msub w1, w23, w21, w22
+; CHECK-NEXT:    msub w8, w20, w7, w19
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v2.b[6], w8
 ; CHECK-NEXT:    sdiv w26, w25, w24
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v2.b[7], w1
-; CHECK-NEXT:    msub w1, w26, w24, w25
-; CHECK-NEXT:    sdiv w9, w13, w12
-; CHECK-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v2.b[8], w1
-; CHECK-NEXT:    msub w9, w9, w12, w13
-; CHECK-NEXT:    smov w13, v1.b[15]
-; CHECK-NEXT:    sdiv w8, w11, w10
-; CHECK-NEXT:    mov v2.b[9], w9
-; CHECK-NEXT:    smov w9, v1.b[14]
-; CHECK-NEXT:    msub w8, w8, w10, w11
-; CHECK-NEXT:    smov w10, v0.b[14]
-; CHECK-NEXT:    sdiv w14, w16, w15
+; CHECK-NEXT:    msub w8, w23, w21, w22
+; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v2.b[7], w8
+; CHECK-NEXT:    sdiv w9, w28, w27
+; CHECK-NEXT:    msub w8, w26, w24, w25
+; CHECK-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v2.b[8], w8
+; CHECK-NEXT:    sdiv w12, w11, w10
+; CHECK-NEXT:    msub w8, w9, w27, w28
+; CHECK-NEXT:    mov v2.b[9], w8
+; CHECK-NEXT:    sdiv w15, w14, w13
+; CHECK-NEXT:    msub w8, w12, w10, w11
+; CHECK-NEXT:    smov w10, v1.b[14]
+; CHECK-NEXT:    smov w11, v0.b[14]
 ; CHECK-NEXT:    mov v2.b[10], w8
-; CHECK-NEXT:    msub w11, w14, w15, w16
+; CHECK-NEXT:    sdiv w18, w17, w16
+; CHECK-NEXT:    msub w8, w15, w13, w14
+; CHECK-NEXT:    smov w13, v1.b[15]
 ; CHECK-NEXT:    smov w14, v0.b[15]
-; CHECK-NEXT:    sdiv w17, w0, w18
-; CHECK-NEXT:    mov v2.b[11], w11
-; CHECK-NEXT:    msub w11, w17, w18, w0
-; CHECK-NEXT:    sdiv w12, w3, w2
-; CHECK-NEXT:    mov v2.b[12], w11
-; CHECK-NEXT:    msub w12, w12, w2, w3
-; CHECK-NEXT:    sdiv w8, w10, w9
-; CHECK-NEXT:    mov v2.b[13], w12
-; CHECK-NEXT:    msub w8, w8, w9, w10
-; CHECK-NEXT:    sdiv w11, w14, w13
+; CHECK-NEXT:    mov v2.b[11], w8
+; CHECK-NEXT:    sdiv w9, w1, w0
+; CHECK-NEXT:    msub w8, w18, w16, w17
+; CHECK-NEXT:    mov v2.b[12], w8
+; CHECK-NEXT:    sdiv w12, w11, w10
+; CHECK-NEXT:    msub w8, w9, w0, w1
+; CHECK-NEXT:    mov v2.b[13], w8
+; CHECK-NEXT:    sdiv w9, w14, w13
+; CHECK-NEXT:    msub w8, w12, w10, w11
 ; CHECK-NEXT:    mov v2.b[14], w8
-; CHECK-NEXT:    msub w8, w11, w13, w14
+; CHECK-NEXT:    msub w8, w9, w13, w14
 ; CHECK-NEXT:    mov v2.b[15], w8
 ; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ldp x26, x25, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 	%tmp3 = srem <16 x i8> %A, %B;
 	ret <16 x i8> %tmp3
@@ -935,19 +939,19 @@ define <4 x i16> @srem4x16(<4 x i16> %A, <4 x i16> %B) {
 ; CHECK-NEXT:    smov w9, v0.h[1]
 ; CHECK-NEXT:    smov w14, v1.h[2]
 ; CHECK-NEXT:    smov w15, v0.h[2]
+; CHECK-NEXT:    smov w17, v1.h[3]
+; CHECK-NEXT:    smov w18, v0.h[3]
 ; CHECK-NEXT:    sdiv w13, w12, w11
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    smov w12, v1.h[3]
 ; CHECK-NEXT:    sdiv w10, w9, w8
-; CHECK-NEXT:    smov w13, v0.h[3]
+; CHECK-NEXT:    msub w11, w13, w11, w12
 ; CHECK-NEXT:    fmov s0, w11
-; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    sdiv w16, w15, w14
+; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    mov v0.h[1], w8
-; CHECK-NEXT:    msub w10, w16, w14, w15
-; CHECK-NEXT:    sdiv w9, w13, w12
-; CHECK-NEXT:    mov v0.h[2], w10
-; CHECK-NEXT:    msub w8, w9, w12, w13
+; CHECK-NEXT:    sdiv w9, w18, w17
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    mov v0.h[2], w8
+; CHECK-NEXT:    msub w8, w9, w17, w18
 ; CHECK-NEXT:    mov v0.h[3], w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -966,37 +970,37 @@ define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) {
 ; CHECK-NEXT:    smov w15, v0.h[2]
 ; CHECK-NEXT:    smov w17, v1.h[3]
 ; CHECK-NEXT:    smov w18, v0.h[3]
-; CHECK-NEXT:    sdiv w13, w12, w11
 ; CHECK-NEXT:    smov w1, v1.h[4]
 ; CHECK-NEXT:    smov w2, v0.h[4]
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    smov w12, v1.h[5]
+; CHECK-NEXT:    smov w4, v1.h[5]
+; CHECK-NEXT:    smov w5, v0.h[5]
+; CHECK-NEXT:    sdiv w13, w12, w11
 ; CHECK-NEXT:    sdiv w10, w9, w8
-; CHECK-NEXT:    smov w13, v0.h[5]
+; CHECK-NEXT:    msub w11, w13, w11, w12
+; CHECK-NEXT:    smov w13, v1.h[7]
 ; CHECK-NEXT:    fmov s2, w11
 ; CHECK-NEXT:    smov w11, v0.h[6]
+; CHECK-NEXT:    sdiv w16, w15, w14
 ; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    smov w10, v1.h[6]
-; CHECK-NEXT:    sdiv w16, w15, w14
 ; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    msub w8, w16, w14, w15
-; CHECK-NEXT:    smov w15, v1.h[7]
 ; CHECK-NEXT:    sdiv w0, w18, w17
-; CHECK-NEXT:    smov w16, v0.h[7]
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    smov w14, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    msub w14, w0, w17, w18
 ; CHECK-NEXT:    sdiv w3, w2, w1
-; CHECK-NEXT:    mov v2.h[3], w14
-; CHECK-NEXT:    msub w14, w3, w1, w2
-; CHECK-NEXT:    sdiv w9, w13, w12
-; CHECK-NEXT:    mov v2.h[4], w14
-; CHECK-NEXT:    msub w9, w9, w12, w13
-; CHECK-NEXT:    sdiv w8, w11, w10
-; CHECK-NEXT:    mov v2.h[5], w9
-; CHECK-NEXT:    msub w8, w8, w10, w11
-; CHECK-NEXT:    sdiv w12, w16, w15
+; CHECK-NEXT:    msub w8, w0, w17, w18
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    sdiv w9, w5, w4
+; CHECK-NEXT:    msub w8, w3, w1, w2
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    sdiv w12, w11, w10
+; CHECK-NEXT:    msub w8, w9, w4, w5
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    sdiv w9, w14, w13
+; CHECK-NEXT:    msub w8, w12, w10, w11
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    msub w8, w12, w15, w16
+; CHECK-NEXT:    msub w8, w9, w13, w14
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
@@ -1029,8 +1033,8 @@ define <2 x i32> @srem2x32(<2 x i32> %A, <2 x i32> %B) {
 ; CHECK-NEXT:    mov w11, v1.s[1]
 ; CHECK-NEXT:    mov w12, v0.s[1]
 ; CHECK-NEXT:    sdiv w10, w9, w8
-; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    sdiv w13, w12, w11
+; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    msub w9, w13, w11, w12
 ; CHECK-NEXT:    mov v0.s[1], w9
@@ -1049,18 +1053,18 @@ define <4 x i32> @srem4x32(<4 x i32> %A, <4 x i32> %B) {
 ; CHECK-NEXT:    mov w9, v0.s[1]
 ; CHECK-NEXT:    mov w14, v1.s[2]
 ; CHECK-NEXT:    mov w15, v0.s[2]
-; CHECK-NEXT:    sdiv w13, w12, w11
 ; CHECK-NEXT:    mov w17, v1.s[3]
 ; CHECK-NEXT:    mov w18, v0.s[3]
-; CHECK-NEXT:    msub w11, w13, w11, w12
+; CHECK-NEXT:    sdiv w13, w12, w11
 ; CHECK-NEXT:    sdiv w10, w9, w8
+; CHECK-NEXT:    msub w11, w13, w11, w12
 ; CHECK-NEXT:    fmov s0, w11
-; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    sdiv w16, w15, w14
+; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    msub w10, w16, w14, w15
 ; CHECK-NEXT:    sdiv w9, w18, w17
-; CHECK-NEXT:    mov v0.s[2], w10
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    mov v0.s[2], w8
 ; CHECK-NEXT:    msub w8, w9, w17, w18
 ; CHECK-NEXT:    mov v0.s[3], w8
 ; CHECK-NEXT:    ret
@@ -1091,8 +1095,8 @@ define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) {
 ; CHECK-NEXT:    mov x11, v1.d[1]
 ; CHECK-NEXT:    mov x12, v0.d[1]
 ; CHECK-NEXT:    sdiv x10, x9, x8
-; CHECK-NEXT:    msub x8, x10, x8, x9
 ; CHECK-NEXT:    sdiv x13, x12, x11
+; CHECK-NEXT:    msub x8, x10, x8, x9
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    msub x9, x13, x11, x12
 ; CHECK-NEXT:    mov v0.d[1], x9
@@ -1129,37 +1133,37 @@ define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) {
 ; CHECK-NEXT:    umov w15, v0.b[2]
 ; CHECK-NEXT:    umov w17, v1.b[3]
 ; CHECK-NEXT:    umov w18, v0.b[3]
-; CHECK-NEXT:    udiv w13, w12, w11
 ; CHECK-NEXT:    umov w1, v1.b[4]
 ; CHECK-NEXT:    umov w2, v0.b[4]
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    umov w12, v1.b[5]
+; CHECK-NEXT:    umov w4, v1.b[5]
+; CHECK-NEXT:    umov w5, v0.b[5]
+; CHECK-NEXT:    udiv w13, w12, w11
 ; CHECK-NEXT:    udiv w10, w9, w8
-; CHECK-NEXT:    umov w13, v0.b[5]
+; CHECK-NEXT:    msub w11, w13, w11, w12
+; CHECK-NEXT:    umov w13, v1.b[7]
 ; CHECK-NEXT:    fmov s2, w11
 ; CHECK-NEXT:    umov w11, v0.b[6]
+; CHECK-NEXT:    udiv w16, w15, w14
 ; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    umov w10, v1.b[6]
-; CHECK-NEXT:    udiv w16, w15, w14
 ; CHECK-NEXT:    mov v2.b[1], w8
-; CHECK-NEXT:    msub w8, w16, w14, w15
-; CHECK-NEXT:    umov w15, v1.b[7]
 ; CHECK-NEXT:    udiv w0, w18, w17
-; CHECK-NEXT:    umov w16, v0.b[7]
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    umov w14, v0.b[7]
 ; CHECK-NEXT:    mov v2.b[2], w8
-; CHECK-NEXT:    msub w14, w0, w17, w18
 ; CHECK-NEXT:    udiv w3, w2, w1
-; CHECK-NEXT:    mov v2.b[3], w14
-; CHECK-NEXT:    msub w14, w3, w1, w2
-; CHECK-NEXT:    udiv w9, w13, w12
-; CHECK-NEXT:    mov v2.b[4], w14
-; CHECK-NEXT:    msub w9, w9, w12, w13
-; CHECK-NEXT:    udiv w8, w11, w10
-; CHECK-NEXT:    mov v2.b[5], w9
-; CHECK-NEXT:    msub w8, w8, w10, w11
-; CHECK-NEXT:    udiv w12, w16, w15
+; CHECK-NEXT:    msub w8, w0, w17, w18
+; CHECK-NEXT:    mov v2.b[3], w8
+; CHECK-NEXT:    udiv w9, w5, w4
+; CHECK-NEXT:    msub w8, w3, w1, w2
+; CHECK-NEXT:    mov v2.b[4], w8
+; CHECK-NEXT:    udiv w12, w11, w10
+; CHECK-NEXT:    msub w8, w9, w4, w5
+; CHECK-NEXT:    mov v2.b[5], w8
+; CHECK-NEXT:    udiv w9, w14, w13
+; CHECK-NEXT:    msub w8, w12, w10, w11
 ; CHECK-NEXT:    mov v2.b[6], w8
-; CHECK-NEXT:    msub w8, w12, w15, w16
+; CHECK-NEXT:    msub w8, w9, w13, w14
 ; CHECK-NEXT:    mov v2.b[7], w8
 ; CHECK-NEXT:    fmov d0, d2
 ; CHECK-NEXT:    ret
@@ -1170,11 +1174,12 @@ define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) {
 define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-LABEL: urem16x8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x26, x25, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    stp x28, x27, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w21, -24
@@ -1183,6 +1188,8 @@ define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-NEXT:    .cfi_offset w24, -48
 ; CHECK-NEXT:    .cfi_offset w25, -56
 ; CHECK-NEXT:    .cfi_offset w26, -64
+; CHECK-NEXT:    .cfi_offset w27, -72
+; CHECK-NEXT:    .cfi_offset w28, -80
 ; CHECK-NEXT:    umov w11, v1.b[0]
 ; CHECK-NEXT:    umov w12, v0.b[0]
 ; CHECK-NEXT:    umov w8, v1.b[1]
@@ -1191,83 +1198,84 @@ define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) {
 ; CHECK-NEXT:    umov w15, v0.b[2]
 ; CHECK-NEXT:    umov w17, v1.b[3]
 ; CHECK-NEXT:    umov w18, v0.b[3]
-; CHECK-NEXT:    udiv w13, w12, w11
 ; CHECK-NEXT:    umov w1, v1.b[4]
 ; CHECK-NEXT:    umov w2, v0.b[4]
 ; CHECK-NEXT:    umov w4, v1.b[5]
 ; CHECK-NEXT:    umov w5, v0.b[5]
+; CHECK-NEXT:    udiv w13, w12, w11
 ; CHECK-NEXT:    umov w7, v1.b[6]
 ; CHECK-NEXT:    umov w19, v0.b[6]
 ; CHECK-NEXT:    umov w21, v1.b[7]
 ; CHECK-NEXT:    umov w22, v0.b[7]
 ; CHECK-NEXT:    umov w24, v1.b[8]
 ; CHECK-NEXT:    umov w25, v0.b[8]
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    umov w12, v1.b[9]
+; CHECK-NEXT:    umov w27, v1.b[9]
+; CHECK-NEXT:    umov w28, v0.b[9]
 ; CHECK-NEXT:    udiv w10, w9, w8
-; CHECK-NEXT:    umov w13, v0.b[9]
+; CHECK-NEXT:    msub w11, w13, w11, w12
+; CHECK-NEXT:    umov w13, v1.b[11]
 ; CHECK-NEXT:    fmov s2, w11
 ; CHECK-NEXT:    umov w11, v0.b[10]
+; CHECK-NEXT:    udiv w16, w15, w14
 ; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    umov w10, v1.b[10]
-; CHECK-NEXT:    udiv w16, w15, w14
 ; CHECK-NEXT:    mov v2.b[1], w8
-; CHECK-NEXT:    msub w8, w16, w14, w15
-; CHECK-NEXT:    umov w15, v1.b[11]
 ; CHECK-NEXT:    udiv w0, w18, w17
-; CHECK-NEXT:    umov w16, v0.b[11]
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    umov w14, v0.b[11]
+; CHECK-NEXT:    umov w16, v1.b[12]
 ; CHECK-NEXT:    mov v2.b[2], w8
-; CHECK-NEXT:    msub w14, w0, w17, w18
-; CHECK-NEXT:    umov w18, v1.b[12]
 ; CHECK-NEXT:    udiv w3, w2, w1
-; CHECK-NEXT:    umov w0, v0.b[12]
-; CHECK-NEXT:    mov v2.b[3], w14
-; CHECK-NEXT:    msub w14, w3, w1, w2
-; CHECK-NEXT:    umov w2, v1.b[13]
+; CHECK-NEXT:    msub w8, w0, w17, w18
+; CHECK-NEXT:    umov w17, v0.b[12]
+; CHECK-NEXT:    umov w0, v1.b[13]
+; CHECK-NEXT:    mov v2.b[3], w8
 ; CHECK-NEXT:    udiv w6, w5, w4
-; CHECK-NEXT:    umov w3, v0.b[13]
-; CHECK-NEXT:    mov v2.b[4], w14
-; CHECK-NEXT:    msub w17, w6, w4, w5
+; CHECK-NEXT:    msub w8, w3, w1, w2
+; CHECK-NEXT:    umov w1, v0.b[13]
+; CHECK-NEXT:    mov v2.b[4], w8
 ; CHECK-NEXT:    udiv w20, w19, w7
-; CHECK-NEXT:    mov v2.b[5], w17
-; CHECK-NEXT:    msub w17, w20, w7, w19
+; CHECK-NEXT:    msub w8, w6, w4, w5
+; CHECK-NEXT:    mov v2.b[5], w8
 ; CHECK-NEXT:    udiv w23, w22, w21
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v2.b[6], w17
-; CHECK-NEXT:    msub w1, w23, w21, w22
+; CHECK-NEXT:    msub w8, w20, w7, w19
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v2.b[6], w8
 ; CHECK-NEXT:    udiv w26, w25, w24
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v2.b[7], w1
-; CHECK-NEXT:    msub w1, w26, w24, w25
-; CHECK-NEXT:    udiv w9, w13, w12
-; CHECK-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v2.b[8], w1
-; CHECK-NEXT:    msub w9, w9, w12, w13
-; CHECK-NEXT:    umov w13, v1.b[15]
-; CHECK-NEXT:    udiv w8, w11, w10
-; CHECK-NEXT:    mov v2.b[9], w9
-; CHECK-NEXT:    umov w9, v1.b[14]
-; CHECK-NEXT:    msub w8, w8, w10, w11
-; CHECK-NEXT:    umov w10, v0.b[14]
-; CHECK-NEXT:    udiv w14, w16, w15
+; CHECK-NEXT:    msub w8, w23, w21, w22
+; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v2.b[7], w8
+; CHECK-NEXT:    udiv w9, w28, w27
+; CHECK-NEXT:    msub w8, w26, w24, w25
+; CHECK-NEXT:    ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v2.b[8], w8
+; CHECK-NEXT:    udiv w12, w11, w10
+; CHECK-NEXT:    msub w8, w9, w27, w28
+; CHECK-NEXT:    mov v2.b[9], w8
+; CHECK-NEXT:    udiv w15, w14, w13
+; CHECK-NEXT:    msub w8, w12, w10, w11
+; CHECK-NEXT:    umov w10, v1.b[14]
+; CHECK-NEXT:    umov w11, v0.b[14]
 ; CHECK-NEXT:    mov v2.b[10], w8
-; CHECK-NEXT:    msub w11, w14, w15, w16
+; CHECK-NEXT:    udiv w18, w17, w16
+; CHECK-NEXT:    msub w8, w15, w13, w14
+; CHECK-NEXT:    umov w13, v1.b[15]
 ; CHECK-NEXT:    umov w14, v0.b[15]
-; CHECK-NEXT:    udiv w17, w0, w18
-; CHECK-NEXT:    mov v2.b[11], w11
-; CHECK-NEXT:    msub w11, w17, w18, w0
-; CHECK-NEXT:    udiv w12, w3, w2
-; CHECK-NEXT:    mov v2.b[12], w11
-; CHECK-NEXT:    msub w12, w12, w2, w3
-; CHECK-NEXT:    udiv w8, w10, w9
-; CHECK-NEXT:    mov v2.b[13], w12
-; CHECK-NEXT:    msub w8, w8, w9, w10
-; CHECK-NEXT:    udiv w11, w14, w13
+; CHECK-NEXT:    mov v2.b[11], w8
+; CHECK-NEXT:    udiv w9, w1, w0
+; CHECK-NEXT:    msub w8, w18, w16, w17
+; CHECK-NEXT:    mov v2.b[12], w8
+; CHECK-NEXT:    udiv w12, w11, w10
+; CHECK-NEXT:    msub w8, w9, w0, w1
+; CHECK-NEXT:    mov v2.b[13], w8
+; CHECK-NEXT:    udiv w9, w14, w13
+; CHECK-NEXT:    msub w8, w12, w10, w11
 ; CHECK-NEXT:    mov v2.b[14], w8
-; CHECK-NEXT:    msub w8, w11, w13, w14
+; CHECK-NEXT:    msub w8, w9, w13, w14
 ; CHECK-NEXT:    mov v2.b[15], w8
 ; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ldp x26, x25, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x27, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 	%tmp3 = urem <16 x i8> %A, %B;
 	ret <16 x i8> %tmp3
@@ -1299,19 +1307,19 @@ define <4 x i16> @urem4x16(<4 x i16> %A, <4 x i16> %B) {
 ; CHECK-NEXT:    umov w9, v0.h[1]
 ; CHECK-NEXT:    umov w14, v1.h[2]
 ; CHECK-NEXT:    umov w15, v0.h[2]
+; CHECK-NEXT:    umov w17, v1.h[3]
+; CHECK-NEXT:    umov w18, v0.h[3]
 ; CHECK-NEXT:    udiv w13, w12, w11
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    umov w12, v1.h[3]
 ; CHECK-NEXT:    udiv w10, w9, w8
-; CHECK-NEXT:    umov w13, v0.h[3]
+; CHECK-NEXT:    msub w11, w13, w11, w12
 ; CHECK-NEXT:    fmov s0, w11
-; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    udiv w16, w15, w14
+; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    mov v0.h[1], w8
-; CHECK-NEXT:    msub w10, w16, w14, w15
-; CHECK-NEXT:    udiv w9, w13, w12
-; CHECK-NEXT:    mov v0.h[2], w10
-; CHECK-NEXT:    msub w8, w9, w12, w13
+; CHECK-NEXT:    udiv w9, w18, w17
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    mov v0.h[2], w8
+; CHECK-NEXT:    msub w8, w9, w17, w18
 ; CHECK-NEXT:    mov v0.h[3], w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -1330,37 +1338,37 @@ define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) {
 ; CHECK-NEXT:    umov w15, v0.h[2]
 ; CHECK-NEXT:    umov w17, v1.h[3]
 ; CHECK-NEXT:    umov w18, v0.h[3]
-; CHECK-NEXT:    udiv w13, w12, w11
 ; CHECK-NEXT:    umov w1, v1.h[4]
 ; CHECK-NEXT:    umov w2, v0.h[4]
-; CHECK-NEXT:    msub w11, w13, w11, w12
-; CHECK-NEXT:    umov w12, v1.h[5]
+; CHECK-NEXT:    umov w4, v1.h[5]
+; CHECK-NEXT:    umov w5, v0.h[5]
+; CHECK-NEXT:    udiv w13, w12, w11
 ; CHECK-NEXT:    udiv w10, w9, w8
-; CHECK-NEXT:    umov w13, v0.h[5]
+; CHECK-NEXT:    msub w11, w13, w11, w12
+; CHECK-NEXT:    umov w13, v1.h[7]
 ; CHECK-NEXT:    fmov s2, w11
 ; CHECK-NEXT:    umov w11, v0.h[6]
+; CHECK-NEXT:    udiv w16, w15, w14
 ; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    umov w10, v1.h[6]
-; CHECK-NEXT:    udiv w16, w15, w14
 ; CHECK-NEXT:    mov v2.h[1], w8
-; CHECK-NEXT:    msub w8, w16, w14, w15
-; CHECK-NEXT:    umov w15, v1.h[7]
 ; CHECK-NEXT:    udiv w0, w18, w17
-; CHECK-NEXT:    umov w16, v0.h[7]
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    umov w14, v0.h[7]
 ; CHECK-NEXT:    mov v2.h[2], w8
-; CHECK-NEXT:    msub w14, w0, w17, w18
 ; CHECK-NEXT:    udiv w3, w2, w1
-; CHECK-NEXT:    mov v2.h[3], w14
-; CHECK-NEXT:    msub w14, w3, w1, w2
-; CHECK-NEXT:    udiv w9, w13, w12
-; CHECK-NEXT:    mov v2.h[4], w14
-; CHECK-NEXT:    msub w9, w9, w12, w13
-; CHECK-NEXT:    udiv w8, w11, w10
-; CHECK-NEXT:    mov v2.h[5], w9
-; CHECK-NEXT:    msub w8, w8, w10, w11
-; CHECK-NEXT:    udiv w12, w16, w15
+; CHECK-NEXT:    msub w8, w0, w17, w18
+; CHECK-NEXT:    mov v2.h[3], w8
+; CHECK-NEXT:    udiv w9, w5, w4
+; CHECK-NEXT:    msub w8, w3, w1, w2
+; CHECK-NEXT:    mov v2.h[4], w8
+; CHECK-NEXT:    udiv w12, w11, w10
+; CHECK-NEXT:    msub w8, w9, w4, w5
+; CHECK-NEXT:    mov v2.h[5], w8
+; CHECK-NEXT:    udiv w9, w14, w13
+; CHECK-NEXT:    msub w8, w12, w10, w11
 ; CHECK-NEXT:    mov v2.h[6], w8
-; CHECK-NEXT:    msub w8, w12, w15, w16
+; CHECK-NEXT:    msub w8, w9, w13, w14
 ; CHECK-NEXT:    mov v2.h[7], w8
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
@@ -1393,8 +1401,8 @@ define <2 x i32> @urem2x32(<2 x i32> %A, <2 x i32> %B) {
 ; CHECK-NEXT:    mov w11, v1.s[1]
 ; CHECK-NEXT:    mov w12, v0.s[1]
 ; CHECK-NEXT:    udiv w10, w9, w8
-; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    udiv w13, w12, w11
+; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    msub w9, w13, w11, w12
 ; CHECK-NEXT:    mov v0.s[1], w9
@@ -1413,18 +1421,18 @@ define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) {
 ; CHECK-NEXT:    mov w9, v0.s[1]
 ; CHECK-NEXT:    mov w14, v1.s[2]
 ; CHECK-NEXT:    mov w15, v0.s[2]
-; CHECK-NEXT:    udiv w13, w12, w11
 ; CHECK-NEXT:    mov w17, v1.s[3]
 ; CHECK-NEXT:    mov w18, v0.s[3]
-; CHECK-NEXT:    msub w11, w13, w11, w12
+; CHECK-NEXT:    udiv w13, w12, w11
 ; CHECK-NEXT:    udiv w10, w9, w8
+; CHECK-NEXT:    msub w11, w13, w11, w12
 ; CHECK-NEXT:    fmov s0, w11
-; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    udiv w16, w15, w14
+; CHECK-NEXT:    msub w8, w10, w8, w9
 ; CHECK-NEXT:    mov v0.s[1], w8
-; CHECK-NEXT:    msub w10, w16, w14, w15
 ; CHECK-NEXT:    udiv w9, w18, w17
-; CHECK-NEXT:    mov v0.s[2], w10
+; CHECK-NEXT:    msub w8, w16, w14, w15
+; CHECK-NEXT:    mov v0.s[2], w8
 ; CHECK-NEXT:    msub w8, w9, w17, w18
 ; CHECK-NEXT:    mov v0.s[3], w8
 ; CHECK-NEXT:    ret
@@ -1455,8 +1463,8 @@ define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) {
 ; CHECK-NEXT:    mov x11, v1.d[1]
 ; CHECK-NEXT:    mov x12, v0.d[1]
 ; CHECK-NEXT:    udiv x10, x9, x8
-; CHECK-NEXT:    msub x8, x10, x8, x9
 ; CHECK-NEXT:    udiv x13, x12, x11
+; CHECK-NEXT:    msub x8, x10, x8, x9
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    msub x9, x13, x11, x12
 ; CHECK-NEXT:    mov v0.d[1], x9

diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
index bb13e96532baf1..3d52a2c044a0cb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -614,11 +614,11 @@ entry:
 define i16 @test_vqrdmlahh_s16(i16 %a, i16 %b, i16 %c) {
 ; CHECK-LABEL: test_vqrdmlahh_s16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w1
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    fmov s2, w2
-; CHECK-NEXT:    sqrdmlah v1.4h, v0.4h, v2.4h
-; CHECK-NEXT:    umov w0, v1.h[0]
+; CHECK-NEXT:    sqrdmlah v0.4h, v1.4h, v2.4h
+; CHECK-NEXT:    umov w0, v0.h[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = insertelement <4 x i16> undef, i16 %a, i64 0
@@ -632,11 +632,11 @@ entry:
 define i32 @test_vqrdmlahs_s32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test_vqrdmlahs_s32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w1
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    fmov s2, w2
-; CHECK-NEXT:    sqrdmlah s1, s0, s2
-; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    sqrdmlah s0, s1, s2
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 entry:
   %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %c) #4
@@ -646,11 +646,11 @@ entry:
 define i16 @test_vqrdmlahh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
 ; CHECK-LABEL: test_vqrdmlahh_lane_s16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqrdmlah v2.4h, v1.4h, v0.h[3]
-; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    sqrdmlah v1.4h, v2.4h, v0.h[3]
+; CHECK-NEXT:    umov w0, v1.h[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = insertelement <4 x i16> undef, i16 %a, i64 0
@@ -664,11 +664,11 @@ entry:
 define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
 ; CHECK-LABEL: test_vqrdmlahs_lane_s32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqrdmlah s2, s1, v0.s[1]
-; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    sqrdmlah s1, s2, v0.s[1]
+; CHECK-NEXT:    fmov w0, s1
 ; CHECK-NEXT:    ret
 entry:
   %vget_lane = extractelement <2 x i32> %c, i64 1
@@ -679,10 +679,10 @@ entry:
 define i16 @test_vqrdmlahh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
 ; CHECK-LABEL: test_vqrdmlahh_laneq_s16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov s2, w0
-; CHECK-NEXT:    sqrdmlah v2.4h, v1.4h, v0.h[7]
-; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
+; CHECK-NEXT:    sqrdmlah v1.4h, v2.4h, v0.h[7]
+; CHECK-NEXT:    umov w0, v1.h[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = insertelement <4 x i16> undef, i16 %a, i64 0
@@ -696,10 +696,10 @@ entry:
 define i32 @test_vqrdmlahs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) {
 ; CHECK-LABEL: test_vqrdmlahs_laneq_s32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov s2, w0
-; CHECK-NEXT:    sqrdmlah s2, s1, v0.s[3]
-; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
+; CHECK-NEXT:    sqrdmlah s1, s2, v0.s[3]
+; CHECK-NEXT:    fmov w0, s1
 ; CHECK-NEXT:    ret
 entry:
   %vgetq_lane = extractelement <4 x i32> %c, i64 3
@@ -754,11 +754,11 @@ entry:
 define i16 @test_vqrdmlshh_s16(i16 %a, i16 %b, i16 %c) {
 ; CHECK-LABEL: test_vqrdmlshh_s16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w1
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    fmov s2, w2
-; CHECK-NEXT:    sqrdmlsh v1.4h, v0.4h, v2.4h
-; CHECK-NEXT:    umov w0, v1.h[0]
+; CHECK-NEXT:    sqrdmlsh v0.4h, v1.4h, v2.4h
+; CHECK-NEXT:    umov w0, v0.h[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = insertelement <4 x i16> undef, i16 %a, i64 0
@@ -772,11 +772,11 @@ entry:
 define i32 @test_vqrdmlshs_s32(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test_vqrdmlshs_s32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w1
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    fmov s2, w2
-; CHECK-NEXT:    sqrdmlsh s1, s0, s2
-; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    sqrdmlsh s0, s1, s2
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 entry:
   %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %c) #4
@@ -786,11 +786,11 @@ entry:
 define i16 @test_vqrdmlshh_lane_s16(i16 %a, i16 %b, <4 x i16> %c) {
 ; CHECK-LABEL: test_vqrdmlshh_lane_s16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqrdmlsh v2.4h, v1.4h, v0.h[3]
-; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    sqrdmlsh v1.4h, v2.4h, v0.h[3]
+; CHECK-NEXT:    umov w0, v1.h[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = insertelement <4 x i16> undef, i16 %a, i64 0
@@ -804,11 +804,11 @@ entry:
 define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
 ; CHECK-LABEL: test_vqrdmlshs_lane_s32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqrdmlsh s2, s1, v0.s[1]
-; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    sqrdmlsh s1, s2, v0.s[1]
+; CHECK-NEXT:    fmov w0, s1
 ; CHECK-NEXT:    ret
 entry:
   %vget_lane = extractelement <2 x i32> %c, i64 1
@@ -819,10 +819,10 @@ entry:
 define i16 @test_vqrdmlshh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
 ; CHECK-LABEL: test_vqrdmlshh_laneq_s16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov s2, w0
-; CHECK-NEXT:    sqrdmlsh v2.4h, v1.4h, v0.h[7]
-; CHECK-NEXT:    umov w0, v2.h[0]
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
+; CHECK-NEXT:    sqrdmlsh v1.4h, v2.4h, v0.h[7]
+; CHECK-NEXT:    umov w0, v1.h[0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = insertelement <4 x i16> undef, i16 %a, i64 0
@@ -836,10 +836,10 @@ entry:
 define i32 @test_vqrdmlshs_laneq_s32(i32 %a, i32 %b, <4 x i32> %c) {
 ; CHECK-LABEL: test_vqrdmlshs_laneq_s32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov s2, w0
-; CHECK-NEXT:    sqrdmlsh s2, s1, v0.s[3]
-; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
+; CHECK-NEXT:    sqrdmlsh s1, s2, v0.s[3]
+; CHECK-NEXT:    fmov w0, s1
 ; CHECK-NEXT:    ret
 entry:
   %vgetq_lane = extractelement <4 x i32> %c, i64 3

diff --git a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
index 5a44550cc172af..d48c3d437d6202 100644
--- a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -17,9 +17,9 @@ define i56 @ldi56(ptr %p) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #6]
 ; CHECK-NEXT:    ldrh w9, [x0, #4]
-; CHECK-NEXT:    ldr w10, [x0]
 ; CHECK-NEXT:    orr w8, w9, w8, lsl #16
-; CHECK-NEXT:    orr x0, x10, x8, lsl #32
+; CHECK-NEXT:    ldr w9, [x0]
+; CHECK-NEXT:    orr x0, x9, x8, lsl #32
 ; CHECK-NEXT:    ret
     %r = load i56, i56* %p
     ret i56 %r
@@ -41,10 +41,10 @@ define i120 @ldi120(ptr %p) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #14]
 ; CHECK-NEXT:    ldrh w9, [x0, #12]
-; CHECK-NEXT:    ldr w10, [x0, #8]
-; CHECK-NEXT:    ldr x0, [x0]
 ; CHECK-NEXT:    orr w8, w9, w8, lsl #16
-; CHECK-NEXT:    orr x1, x10, x8, lsl #32
+; CHECK-NEXT:    ldr w9, [x0, #8]
+; CHECK-NEXT:    ldr x0, [x0]
+; CHECK-NEXT:    orr x1, x9, x8, lsl #32
 ; CHECK-NEXT:    ret
     %r = load i120, i120* %p
     ret i120 %r
@@ -53,12 +53,12 @@ define i120 @ldi120(ptr %p) nounwind {
 define i280 @ldi280(ptr %p) nounwind {
 ; CHECK-LABEL: ldi280:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x1, [x0]
 ; CHECK-NEXT:    ldrb w9, [x0, #34]
 ; CHECK-NEXT:    ldrh w10, [x0, #32]
+; CHECK-NEXT:    ldp x8, x1, [x0]
 ; CHECK-NEXT:    ldp x2, x3, [x0, #16]
-; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    orr x4, x10, x9, lsl #16
+; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    ret
     %r = load i280, i280* %p
     ret i280 %r
@@ -128,15 +128,15 @@ define void @sti280(ptr %p, i280 %a) nounwind {
 define void @i56_or(ptr %a) {
 ; CHECK-LABEL: i56_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x8, #4]!
-; CHECK-NEXT:    ldrb w11, [x8, #2]
-; CHECK-NEXT:    orr w9, w9, #0x180
-; CHECK-NEXT:    orr w10, w10, w11, lsl #16
-; CHECK-NEXT:    str w9, [x0]
-; CHECK-NEXT:    strb w11, [x8, #2]
-; CHECK-NEXT:    strh w10, [x8]
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    ldrh w10, [x9, #4]!
+; CHECK-NEXT:    ldrb w11, [x9, #2]
+; CHECK-NEXT:    orr w8, w8, #0x180
+; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    orr w8, w10, w11, lsl #16
+; CHECK-NEXT:    strb w11, [x9, #2]
+; CHECK-NEXT:    strh w8, [x9]
 ; CHECK-NEXT:    ret
   %aa = load i56, ptr %a, align 1
   %b = or i56 %aa, 384
@@ -147,16 +147,16 @@ define void @i56_or(ptr %a) {
 define void @i56_and_or(ptr %a) {
 ; CHECK-LABEL: i56_and_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x8, #4]!
-; CHECK-NEXT:    ldrb w11, [x8, #2]
-; CHECK-NEXT:    orr w9, w9, #0x180
-; CHECK-NEXT:    and w9, w9, #0xffffff80
-; CHECK-NEXT:    orr w10, w10, w11, lsl #16
-; CHECK-NEXT:    strb w11, [x8, #2]
-; CHECK-NEXT:    str w9, [x0]
-; CHECK-NEXT:    strh w10, [x8]
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    ldrh w10, [x9, #4]!
+; CHECK-NEXT:    ldrb w11, [x9, #2]
+; CHECK-NEXT:    orr w8, w8, #0x180
+; CHECK-NEXT:    and w8, w8, #0xffffff80
+; CHECK-NEXT:    strb w11, [x9, #2]
+; CHECK-NEXT:    str w8, [x0]
+; CHECK-NEXT:    orr w8, w10, w11, lsl #16
+; CHECK-NEXT:    strh w8, [x9]
 ; CHECK-NEXT:    ret
   %b = load i56, ptr %a, align 1
   %c = and i56 %b, -128
@@ -175,8 +175,8 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
 ; CHECK-NEXT:    orr w9, w9, w10, lsl #16
 ; CHECK-NEXT:    strb w10, [x8, #2]
 ; CHECK-NEXT:    orr x11, x11, x9, lsl #32
-; CHECK-NEXT:    and x11, x11, #0xffffffffffffdfff
 ; CHECK-NEXT:    strh w9, [x8]
+; CHECK-NEXT:    and x11, x11, #0xffffffffffffdfff
 ; CHECK-NEXT:    orr w11, w11, w1, lsl #13
 ; CHECK-NEXT:    str w11, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
index 527393657530b6..80314765abdd1f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -51,8 +51,8 @@ entry:
 define internal void @nvcast_f32_v8i8() {
 ; CHECK-LABEL: nvcast_f32_v8i8:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    adrp x8, __gv@GOTPAGE
 ; CHECK-NEXT:    movi.8b v0, #254
+; CHECK-NEXT:    adrp x8, __gv@GOTPAGE
 ; CHECK-NEXT:    ldr x8, [x8, __gv@GOTPAGEOFF]
 ; CHECK-NEXT:    str d0, [x8]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
index 8ec2bd8ef5a97a..86ebf803c57831 100644
--- a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
@@ -45,8 +45,8 @@ define [1 x <4 x float>] @test2() {
 ; CHECK-NEXT: Lloh3:
 ; CHECK-NEXT:     ldr q1, [x8, lCPI1_0@PAGEOFF]
 ; CHECK-NEXT:     mov s2, v1[1]
-; CHECK-NEXT:     mov s3, v1[2]
 ; CHECK-NEXT:     fneg    s0, s1
+; CHECK-NEXT:     mov s3, v1[2]
 ; CHECK-NEXT:     mov s1, v1[3]
 ; CHECK-NEXT:     fneg    s2, s2
 ; CHECK-NEXT:     fneg    s3, s3

diff --git a/llvm/test/CodeGen/AArch64/arm64-register-pairing.ll b/llvm/test/CodeGen/AArch64/arm64-register-pairing.ll
index 3ea0b6994ea7ae..a96cad7e32dcdd 100644
--- a/llvm/test/CodeGen/AArch64/arm64-register-pairing.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-register-pairing.ll
@@ -17,7 +17,7 @@ define void @odd() nounwind {
 ; CHECK-NEXT:    stp x22, x21, [sp, #112] ; 16-byte Folded Spill
 ; CHECK-NEXT:    stp x20, x19, [sp, #128] ; 16-byte Folded Spill
 ; CHECK-NEXT:    ; InlineAsm Start
-; CHECK-NEXT:    mov x0, #42
+; CHECK-NEXT:    mov x0, #42 ; =0x2a
 ; CHECK-NEXT:    ; InlineAsm End
 ; CHECK-NEXT:    ldp x20, x19, [sp, #128] ; 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #112] ; 16-byte Folded Reload
@@ -38,12 +38,12 @@ define void @odd() nounwind {
 ; CHECK-NOTMACHO-NEXT:    stp x25, x23, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NOTMACHO-NEXT:    stp x21, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NOTMACHO-NEXT:    //APP
-; CHECK-NOTMACHO-NEXT:    mov x0, #42
+; CHECK-NOTMACHO-NEXT:    mov x0, #42 // =0x2a
 ; CHECK-NOTMACHO-NEXT:    //NO_APP
 ; CHECK-NOTMACHO-NEXT:    ldp x21, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NOTMACHO-NEXT:    ldr x27, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NOTMACHO-NEXT:    ldp x25, x23, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NOTMACHO-NEXT:    ldp d10, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NOTMACHO-NEXT:    ldr x27, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NOTMACHO-NEXT:    ldp d14, d12, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NOTMACHO-NEXT:    ret
 
@@ -64,7 +64,7 @@ define void @even() nounwind {
 ; CHECK-NEXT:    stp x22, x21, [sp, #112] ; 16-byte Folded Spill
 ; CHECK-NEXT:    stp x20, x19, [sp, #128] ; 16-byte Folded Spill
 ; CHECK-NEXT:    ; InlineAsm Start
-; CHECK-NEXT:    mov x0, #42
+; CHECK-NEXT:    mov x0, #42 ; =0x2a
 ; CHECK-NEXT:    ; InlineAsm End
 ; CHECK-NEXT:    ldp x20, x19, [sp, #128] ; 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #112] ; 16-byte Folded Reload
@@ -85,12 +85,12 @@ define void @even() nounwind {
 ; CHECK-NOTMACHO-NEXT:    stp x26, x24, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NOTMACHO-NEXT:    stp x22, x20, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NOTMACHO-NEXT:    //APP
-; CHECK-NOTMACHO-NEXT:    mov x0, #42
+; CHECK-NOTMACHO-NEXT:    mov x0, #42 // =0x2a
 ; CHECK-NOTMACHO-NEXT:    //NO_APP
 ; CHECK-NOTMACHO-NEXT:    ldp x22, x20, [sp, #64] // 16-byte Folded Reload
+; CHECK-NOTMACHO-NEXT:    ldr x28, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NOTMACHO-NEXT:    ldp x26, x24, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NOTMACHO-NEXT:    ldp d11, d9, [sp, #16] // 16-byte Folded Reload
-; CHECK-NOTMACHO-NEXT:    ldr x28, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NOTMACHO-NEXT:    ldp d15, d13, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NOTMACHO-NEXT:    ret
 

diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
index dd9ccd771c7a1d..28b22cc0c57edb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -306,8 +306,8 @@ define <8 x i8> @test_vrev32D8(ptr %A) nounwind {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr d0, [x0]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI19_0
-; CHECK-GI-NEXT:    mov.d v0[1], v0[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI19_0]
+; CHECK-GI-NEXT:    mov.d v0[1], v0[0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -327,8 +327,8 @@ define <4 x i16> @test_vrev32D16(ptr %A) nounwind {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr d0, [x0]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI20_0
-; CHECK-GI-NEXT:    mov.d v0[1], v0[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI20_0]
+; CHECK-GI-NEXT:    mov.d v0[1], v0[0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -386,8 +386,8 @@ define <8 x i8> @test_vrev16D8(ptr %A) nounwind {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr d0, [x0]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI23_0
-; CHECK-GI-NEXT:    mov.d v0[1], v0[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NEXT:    mov.d v0[1], v0[0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0 }, v1
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -451,16 +451,16 @@ define <8 x i16> @test_vrev32Q16_undef(ptr %A) nounwind {
 define void @test_vrev64(ptr nocapture %source, ptr nocapture %dst) nounwind ssp {
 ; CHECK-SD-LABEL: test_vrev64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    add x8, x1, #2
 ; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    add x8, x1, #2
 ; CHECK-SD-NEXT:    st1.h { v0 }[5], [x8]
 ; CHECK-SD-NEXT:    st1.h { v0 }[6], [x1]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: test_vrev64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    add x8, x1, #2
 ; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    add x8, x1, #2
 ; CHECK-GI-NEXT:    st1.h { v0 }[6], [x1]
 ; CHECK-GI-NEXT:    st1.h { v0 }[5], [x8]
 ; CHECK-GI-NEXT:    ret
@@ -487,8 +487,8 @@ define void @float_vrev64(ptr nocapture %source, ptr nocapture %dest) nounwind n
 ;
 ; CHECK-GI-LABEL: float_vrev64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI28_0
 ; CHECK-GI-NEXT:    movi d0, #0000000000000000
+; CHECK-GI-NEXT:    adrp x8, .LCPI28_0
 ; CHECK-GI-NEXT:    ldr q1, [x0]
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI28_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
@@ -615,19 +615,19 @@ entry:
 define i64 @test_rev16_x_hwbyteswaps_complex1(i64 %a) nounwind {
 ; CHECK-SD-LABEL: test_rev16_x_hwbyteswaps_complex1:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    lsr x8, x0, #48
-; CHECK-SD-NEXT:    lsr x9, x0, #8
-; CHECK-SD-NEXT:    lsr x10, x0, #32
-; CHECK-SD-NEXT:    and x11, x9, #0xff000000000000
-; CHECK-SD-NEXT:    lsr x12, x0, #16
-; CHECK-SD-NEXT:    bfi x11, x8, #56, #8
-; CHECK-SD-NEXT:    and x8, x9, #0xff00000000
-; CHECK-SD-NEXT:    orr x8, x11, x8
-; CHECK-SD-NEXT:    and x9, x9, #0xff0000
-; CHECK-SD-NEXT:    bfi x8, x10, #40, #8
-; CHECK-SD-NEXT:    orr x8, x8, x9
+; CHECK-SD-NEXT:    lsr x8, x0, #8
+; CHECK-SD-NEXT:    lsr x9, x0, #48
+; CHECK-SD-NEXT:    and x10, x8, #0xff000000000000
+; CHECK-SD-NEXT:    and x11, x8, #0xff00000000
+; CHECK-SD-NEXT:    and x8, x8, #0xff0000
+; CHECK-SD-NEXT:    bfi x10, x9, #56, #8
+; CHECK-SD-NEXT:    lsr x9, x0, #32
+; CHECK-SD-NEXT:    orr x10, x10, x11
+; CHECK-SD-NEXT:    bfi x10, x9, #40, #8
+; CHECK-SD-NEXT:    lsr x9, x0, #16
+; CHECK-SD-NEXT:    orr x8, x10, x8
+; CHECK-SD-NEXT:    bfi x8, x9, #24, #8
 ; CHECK-SD-NEXT:    ubfiz x9, x0, #8, #8
-; CHECK-SD-NEXT:    bfi x8, x12, #24, #8
 ; CHECK-SD-NEXT:    bfxil x8, x0, #8, #8
 ; CHECK-SD-NEXT:    orr x0, x8, x9
 ; CHECK-SD-NEXT:    ret
@@ -640,16 +640,16 @@ define i64 @test_rev16_x_hwbyteswaps_complex1(i64 %a) nounwind {
 ; CHECK-GI-NEXT:    and x11, x9, #0xff00000000000000
 ; CHECK-GI-NEXT:    and x12, x8, #0xff00000000
 ; CHECK-GI-NEXT:    and x13, x9, #0xff0000000000
+; CHECK-GI-NEXT:    and x14, x8, #0xff0000
 ; CHECK-GI-NEXT:    orr x10, x10, x11
-; CHECK-GI-NEXT:    orr x11, x12, x13
-; CHECK-GI-NEXT:    and x12, x8, #0xff0000
-; CHECK-GI-NEXT:    and x13, x9, #0xff000000
+; CHECK-GI-NEXT:    and x11, x9, #0xff000000
 ; CHECK-GI-NEXT:    orr x12, x12, x13
 ; CHECK-GI-NEXT:    and x8, x8, #0xff
-; CHECK-GI-NEXT:    orr x10, x10, x11
-; CHECK-GI-NEXT:    orr x8, x12, x8
-; CHECK-GI-NEXT:    orr x8, x10, x8
+; CHECK-GI-NEXT:    orr x11, x14, x11
+; CHECK-GI-NEXT:    orr x10, x10, x12
 ; CHECK-GI-NEXT:    and x9, x9, #0xff00
+; CHECK-GI-NEXT:    orr x8, x11, x8
+; CHECK-GI-NEXT:    orr x8, x10, x8
 ; CHECK-GI-NEXT:    orr x0, x8, x9
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -676,14 +676,14 @@ entry:
 define i64 @test_rev16_x_hwbyteswaps_complex2(i64 %a) nounwind {
 ; CHECK-SD-LABEL: test_rev16_x_hwbyteswaps_complex2:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    lsr x8, x0, #8
 ; CHECK-SD-NEXT:    lsr x9, x0, #48
 ; CHECK-SD-NEXT:    lsr x10, x0, #32
-; CHECK-SD-NEXT:    lsr x8, x0, #8
-; CHECK-SD-NEXT:    lsr x11, x0, #16
 ; CHECK-SD-NEXT:    and x8, x8, #0xff00ff00ff00ff
 ; CHECK-SD-NEXT:    bfi x8, x9, #56, #8
+; CHECK-SD-NEXT:    lsr x9, x0, #16
 ; CHECK-SD-NEXT:    bfi x8, x10, #40, #8
-; CHECK-SD-NEXT:    bfi x8, x11, #24, #8
+; CHECK-SD-NEXT:    bfi x8, x9, #24, #8
 ; CHECK-SD-NEXT:    bfi x8, x0, #8, #8
 ; CHECK-SD-NEXT:    mov x0, x8
 ; CHECK-SD-NEXT:    ret
@@ -696,16 +696,16 @@ define i64 @test_rev16_x_hwbyteswaps_complex2(i64 %a) nounwind {
 ; CHECK-GI-NEXT:    and x11, x8, #0xff00000000
 ; CHECK-GI-NEXT:    and x12, x8, #0xff0000
 ; CHECK-GI-NEXT:    and x8, x8, #0xff
+; CHECK-GI-NEXT:    and x13, x9, #0xff00000000000000
 ; CHECK-GI-NEXT:    orr x10, x10, x11
+; CHECK-GI-NEXT:    and x11, x9, #0xff0000000000
 ; CHECK-GI-NEXT:    orr x8, x12, x8
-; CHECK-GI-NEXT:    and x11, x9, #0xff00000000000000
-; CHECK-GI-NEXT:    and x12, x9, #0xff0000000000
-; CHECK-GI-NEXT:    orr x11, x11, x12
 ; CHECK-GI-NEXT:    and x12, x9, #0xff000000
+; CHECK-GI-NEXT:    orr x11, x13, x11
 ; CHECK-GI-NEXT:    orr x8, x10, x8
+; CHECK-GI-NEXT:    and x9, x9, #0xff00
 ; CHECK-GI-NEXT:    orr x10, x11, x12
 ; CHECK-GI-NEXT:    orr x8, x8, x10
-; CHECK-GI-NEXT:    and x9, x9, #0xff00
 ; CHECK-GI-NEXT:    orr x0, x8, x9
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -733,19 +733,19 @@ entry:
 define i64 @test_rev16_x_hwbyteswaps_complex3(i64 %a) nounwind {
 ; CHECK-SD-LABEL: test_rev16_x_hwbyteswaps_complex3:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    lsr x8, x0, #48
-; CHECK-SD-NEXT:    lsr x9, x0, #8
-; CHECK-SD-NEXT:    lsr x10, x0, #32
-; CHECK-SD-NEXT:    and x11, x9, #0xff000000000000
-; CHECK-SD-NEXT:    lsr x12, x0, #16
-; CHECK-SD-NEXT:    bfi x11, x8, #56, #8
-; CHECK-SD-NEXT:    and x8, x9, #0xff00000000
-; CHECK-SD-NEXT:    orr x8, x8, x11
-; CHECK-SD-NEXT:    and x9, x9, #0xff0000
-; CHECK-SD-NEXT:    bfi x8, x10, #40, #8
-; CHECK-SD-NEXT:    orr x8, x9, x8
+; CHECK-SD-NEXT:    lsr x8, x0, #8
+; CHECK-SD-NEXT:    lsr x9, x0, #48
+; CHECK-SD-NEXT:    and x10, x8, #0xff000000000000
+; CHECK-SD-NEXT:    and x11, x8, #0xff00000000
+; CHECK-SD-NEXT:    and x8, x8, #0xff0000
+; CHECK-SD-NEXT:    bfi x10, x9, #56, #8
+; CHECK-SD-NEXT:    lsr x9, x0, #32
+; CHECK-SD-NEXT:    orr x10, x11, x10
+; CHECK-SD-NEXT:    bfi x10, x9, #40, #8
+; CHECK-SD-NEXT:    lsr x9, x0, #16
+; CHECK-SD-NEXT:    orr x8, x8, x10
+; CHECK-SD-NEXT:    bfi x8, x9, #24, #8
 ; CHECK-SD-NEXT:    ubfiz x9, x0, #8, #8
-; CHECK-SD-NEXT:    bfi x8, x12, #24, #8
 ; CHECK-SD-NEXT:    bfxil x8, x0, #8, #8
 ; CHECK-SD-NEXT:    orr x0, x9, x8
 ; CHECK-SD-NEXT:    ret
@@ -758,16 +758,16 @@ define i64 @test_rev16_x_hwbyteswaps_complex3(i64 %a) nounwind {
 ; CHECK-GI-NEXT:    and x11, x9, #0xff00000000000000
 ; CHECK-GI-NEXT:    and x12, x8, #0xff00000000
 ; CHECK-GI-NEXT:    and x13, x9, #0xff0000000000
+; CHECK-GI-NEXT:    and x14, x8, #0xff0000
 ; CHECK-GI-NEXT:    orr x10, x11, x10
-; CHECK-GI-NEXT:    orr x11, x13, x12
-; CHECK-GI-NEXT:    and x12, x8, #0xff0000
-; CHECK-GI-NEXT:    and x13, x9, #0xff000000
+; CHECK-GI-NEXT:    and x11, x9, #0xff000000
 ; CHECK-GI-NEXT:    orr x12, x13, x12
 ; CHECK-GI-NEXT:    and x8, x8, #0xff
-; CHECK-GI-NEXT:    orr x10, x11, x10
-; CHECK-GI-NEXT:    orr x8, x8, x12
-; CHECK-GI-NEXT:    orr x8, x8, x10
+; CHECK-GI-NEXT:    orr x11, x11, x14
+; CHECK-GI-NEXT:    orr x10, x12, x10
 ; CHECK-GI-NEXT:    and x9, x9, #0xff00
+; CHECK-GI-NEXT:    orr x8, x8, x11
+; CHECK-GI-NEXT:    orr x8, x8, x10
 ; CHECK-GI-NEXT:    orr x0, x9, x8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -794,11 +794,11 @@ entry:
 define i64 @test_or_and_combine1(i64 %a) nounwind {
 ; CHECK-SD-LABEL: test_or_and_combine1:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    lsr x8, x0, #24
-; CHECK-SD-NEXT:    lsr x9, x0, #8
-; CHECK-SD-NEXT:    and x10, x9, #0xff000000000000
-; CHECK-SD-NEXT:    bfi x10, x8, #32, #8
-; CHECK-SD-NEXT:    and x8, x9, #0xff0000
+; CHECK-SD-NEXT:    lsr x8, x0, #8
+; CHECK-SD-NEXT:    lsr x9, x0, #24
+; CHECK-SD-NEXT:    and x10, x8, #0xff000000000000
+; CHECK-SD-NEXT:    and x8, x8, #0xff0000
+; CHECK-SD-NEXT:    bfi x10, x9, #32, #8
 ; CHECK-SD-NEXT:    orr x0, x10, x8
 ; CHECK-SD-NEXT:    ret
 ;
@@ -808,8 +808,8 @@ define i64 @test_or_and_combine1(i64 %a) nounwind {
 ; CHECK-GI-NEXT:    lsl x9, x0, #8
 ; CHECK-GI-NEXT:    and x10, x8, #0xff000000000000
 ; CHECK-GI-NEXT:    and x9, x9, #0xff00000000
-; CHECK-GI-NEXT:    orr x9, x10, x9
 ; CHECK-GI-NEXT:    and x8, x8, #0xff0000
+; CHECK-GI-NEXT:    orr x9, x10, x9
 ; CHECK-GI-NEXT:    orr x0, x9, x8
 ; CHECK-GI-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll b/llvm/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
index b9cffbcbdbd67b..8ea7934457c973 100644
--- a/llvm/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
@@ -40,9 +40,9 @@ define void @foo1(<4 x float> %val, <4 x float> %test, ptr %p) nounwind {
 define <4 x float> @foo2(<4 x float> %val, <4 x float> %test) nounwind {
 ; CHECK-LABEL: foo2:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    fcmeq.4s v0, v0, v1
 ; CHECK-NEXT:  Lloh0:
 ; CHECK-NEXT:    adrp x8, lCPI2_0@PAGE
-; CHECK-NEXT:    fcmeq.4s v0, v0, v1
 ; CHECK-NEXT:  Lloh1:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI2_0@PAGEOFF]
 ; CHECK-NEXT:    and.16b v0, v0, v1

diff --git a/llvm/test/CodeGen/AArch64/arm64-setcc-swap-infloop.ll b/llvm/test/CodeGen/AArch64/arm64-setcc-swap-infloop.ll
index e2c5546ab42330..20aef34d4cb7d8 100644
--- a/llvm/test/CodeGen/AArch64/arm64-setcc-swap-infloop.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-setcc-swap-infloop.ll
@@ -12,14 +12,14 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
 define <16 x i1> @setcc_swap_infloop(ptr %arg) {
 ; CHECK-LABEL: setcc_swap_infloop:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    mov w9, #16 ; =0x10
 ; CHECK-NEXT:    movi.16b v1, #1
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    ldr q0, [x8]
+; CHECK-NEXT:    cmeq.16b v0, v0, #0
 ; CHECK-NEXT:    cmeq.16b v2, v1, #0
 ; CHECK-NEXT:    str q1, [x8]
-; CHECK-NEXT:    cmeq.16b v0, v0, #0
-; CHECK-NEXT:    str q1, [x9]
+; CHECK-NEXT:    mov w8, #16 ; =0x10
+; CHECK-NEXT:    str q1, [x8]
 ; CHECK-NEXT:    orr.16b v0, v0, v2
 ; CHECK-NEXT:    ret
   call void @llvm.memset.p0.i64(ptr nonnull null, i8 1, i64 32, i1 false)

diff --git a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
index 8079c1306d9b6f..80b8c963a697c9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -80,7 +80,7 @@ define i32 @freqSaveAndRestoreOutsideLoop(i32 %cond, i32 %N) {
 ; ENABLE-NEXT:    .cfi_offset w19, -24
 ; ENABLE-NEXT:    .cfi_offset w20, -32
 ; ENABLE-NEXT:    mov w19, wzr
-; ENABLE-NEXT:    mov w20, #10
+; ENABLE-NEXT:    mov w20, #10 ; =0xa
 ; ENABLE-NEXT:  LBB1_2: ; %for.body
 ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT:    bl _something
@@ -109,7 +109,7 @@ define i32 @freqSaveAndRestoreOutsideLoop(i32 %cond, i32 %N) {
 ; DISABLE-NEXT:    cbz w0, LBB1_4
 ; DISABLE-NEXT:  ; %bb.1: ; %for.body.preheader
 ; DISABLE-NEXT:    mov w19, wzr
-; DISABLE-NEXT:    mov w20, #10
+; DISABLE-NEXT:    mov w20, #10 ; =0xa
 ; DISABLE-NEXT:  LBB1_2: ; %for.body
 ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT:    bl _something
@@ -167,7 +167,7 @@ define i32 @freqSaveAndRestoreOutsideLoop2(i32 %cond) {
 ; ENABLE-NEXT:    .cfi_offset w19, -24
 ; ENABLE-NEXT:    .cfi_offset w20, -32
 ; ENABLE-NEXT:    mov w19, wzr
-; ENABLE-NEXT:    mov w20, #10
+; ENABLE-NEXT:    mov w20, #10 ; =0xa
 ; ENABLE-NEXT:  LBB2_1: ; %for.body
 ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT:    bl _something
@@ -191,7 +191,7 @@ define i32 @freqSaveAndRestoreOutsideLoop2(i32 %cond) {
 ; DISABLE-NEXT:    .cfi_offset w19, -24
 ; DISABLE-NEXT:    .cfi_offset w20, -32
 ; DISABLE-NEXT:    mov w19, wzr
-; DISABLE-NEXT:    mov w20, #10
+; DISABLE-NEXT:    mov w20, #10 ; =0xa
 ; DISABLE-NEXT:  LBB2_1: ; %for.body
 ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT:    bl _something
@@ -235,7 +235,7 @@ define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) {
 ; ENABLE-NEXT:    .cfi_offset w19, -24
 ; ENABLE-NEXT:    .cfi_offset w20, -32
 ; ENABLE-NEXT:    mov w19, wzr
-; ENABLE-NEXT:    mov w20, #10
+; ENABLE-NEXT:    mov w20, #10 ; =0xa
 ; ENABLE-NEXT:  LBB3_2: ; %for.body
 ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT:    bl _something
@@ -265,7 +265,7 @@ define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) {
 ; DISABLE-NEXT:    cbz w0, LBB3_4
 ; DISABLE-NEXT:  ; %bb.1: ; %for.body.preheader
 ; DISABLE-NEXT:    mov w19, wzr
-; DISABLE-NEXT:    mov w20, #10
+; DISABLE-NEXT:    mov w20, #10 ; =0xa
 ; DISABLE-NEXT:  LBB3_2: ; %for.body
 ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT:    bl _something
@@ -329,7 +329,7 @@ define i32 @loopInfoRestoreOutsideLoop(i32 %cond, i32 %N) nounwind uwtable {
 ; ENABLE-NEXT:    .cfi_offset w20, -32
 ; ENABLE-NEXT:    bl _somethingElse
 ; ENABLE-NEXT:    mov w19, wzr
-; ENABLE-NEXT:    mov w20, #10
+; ENABLE-NEXT:    mov w20, #10 ; =0xa
 ; ENABLE-NEXT:  LBB4_2: ; %for.body
 ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT:    bl _something
@@ -366,7 +366,7 @@ define i32 @loopInfoRestoreOutsideLoop(i32 %cond, i32 %N) nounwind uwtable {
 ; DISABLE-NEXT:  ; %bb.1: ; %if.then
 ; DISABLE-NEXT:    bl _somethingElse
 ; DISABLE-NEXT:    mov w19, wzr
-; DISABLE-NEXT:    mov w20, #10
+; DISABLE-NEXT:    mov w20, #10 ; =0xa
 ; DISABLE-NEXT:  LBB4_2: ; %for.body
 ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT:    bl _something
@@ -452,8 +452,8 @@ define i32 @variadicFunc(i32 %cond, i32 %count, ...) nounwind uwtable {
 ; ENABLE-NEXT:    add x9, x8, #8
 ; ENABLE-NEXT:    str x9, [sp, #8]
 ; ENABLE-NEXT:    ldr w8, [x8]
-; ENABLE-NEXT:    add w0, w0, w8
 ; ENABLE-NEXT:    subs w1, w1, #1
+; ENABLE-NEXT:    add w0, w0, w8
 ; ENABLE-NEXT:    b.ne LBB6_2
 ; ENABLE-NEXT:  LBB6_3: ; %for.end
 ; ENABLE-NEXT:    add sp, sp, #16
@@ -480,8 +480,8 @@ define i32 @variadicFunc(i32 %cond, i32 %count, ...) nounwind uwtable {
 ; DISABLE-NEXT:    add x9, x8, #8
 ; DISABLE-NEXT:    str x9, [sp, #8]
 ; DISABLE-NEXT:    ldr w8, [x8]
-; DISABLE-NEXT:    add w0, w0, w8
 ; DISABLE-NEXT:    subs w1, w1, #1
+; DISABLE-NEXT:    add w0, w0, w8
 ; DISABLE-NEXT:    b.ne LBB6_2
 ; DISABLE-NEXT:    b LBB6_4
 ; DISABLE-NEXT:  LBB6_3: ; %if.else
@@ -537,7 +537,7 @@ define i32 @inlineAsm(i32 %cond, i32 %N) {
 ; ENABLE-NEXT:    .cfi_def_cfa_offset 16
 ; ENABLE-NEXT:    .cfi_offset w19, -8
 ; ENABLE-NEXT:    .cfi_offset w20, -16
-; ENABLE-NEXT:    mov w8, #10
+; ENABLE-NEXT:    mov w8, #10 ; =0xa
 ; ENABLE-NEXT:  LBB7_2: ; %for.body
 ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT:    subs w8, w8, #1
@@ -561,7 +561,7 @@ define i32 @inlineAsm(i32 %cond, i32 %N) {
 ; DISABLE-NEXT:    .cfi_offset w20, -16
 ; DISABLE-NEXT:    cbz w0, LBB7_4
 ; DISABLE-NEXT:  ; %bb.1: ; %for.body.preheader
-; DISABLE-NEXT:    mov w8, #10
+; DISABLE-NEXT:    mov w8, #10 ; =0xa
 ; DISABLE-NEXT:  LBB7_2: ; %for.body
 ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT:    subs w8, w8, #1
@@ -612,8 +612,8 @@ define i32 @callVariadicFunc(i32 %cond, i32 %N) {
 ; ENABLE-NEXT:    .cfi_offset w29, -16
 ; ENABLE-NEXT:    stp x1, x1, [sp, #32]
 ; ENABLE-NEXT:    stp x1, x1, [sp, #16]
-; ENABLE-NEXT:    mov w0, w1
 ; ENABLE-NEXT:    stp x1, x1, [sp]
+; ENABLE-NEXT:    mov w0, w1
 ; ENABLE-NEXT:    bl _someVariadicFunc
 ; ENABLE-NEXT:    lsl w0, w0, #3
 ; ENABLE-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
@@ -636,8 +636,8 @@ define i32 @callVariadicFunc(i32 %cond, i32 %N) {
 ; DISABLE-NEXT:  ; %bb.1: ; %if.then
 ; DISABLE-NEXT:    stp x1, x1, [sp, #32]
 ; DISABLE-NEXT:    stp x1, x1, [sp, #16]
-; DISABLE-NEXT:    mov w0, w1
 ; DISABLE-NEXT:    stp x1, x1, [sp]
+; DISABLE-NEXT:    mov w0, w1
 ; DISABLE-NEXT:    bl _someVariadicFunc
 ; DISABLE-NEXT:    lsl w0, w0, #3
 ; DISABLE-NEXT:    b LBB8_3
@@ -676,7 +676,7 @@ define i32 @noreturn(i8 signext %bad_thing) {
 ; ENABLE:       ; %bb.0: ; %entry
 ; ENABLE-NEXT:    cbnz w0, LBB9_2
 ; ENABLE-NEXT:  ; %bb.1: ; %if.end
-; ENABLE-NEXT:    mov w0, #42
+; ENABLE-NEXT:    mov w0, #42 ; =0x2a
 ; ENABLE-NEXT:    ret
 ; ENABLE-NEXT:  LBB9_2: ; %if.abort
 ; ENABLE-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
@@ -695,7 +695,7 @@ define i32 @noreturn(i8 signext %bad_thing) {
 ; DISABLE-NEXT:    .cfi_offset w29, -16
 ; DISABLE-NEXT:    cbnz w0, LBB9_2
 ; DISABLE-NEXT:  ; %bb.1: ; %if.end
-; DISABLE-NEXT:    mov w0, #42
+; DISABLE-NEXT:    mov w0, #42 ; =0x2a
 ; DISABLE-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; DISABLE-NEXT:    ret
 ; DISABLE-NEXT:  LBB9_2: ; %if.abort
@@ -816,10 +816,10 @@ define void @infiniteloop2() {
 ; ENABLE-NEXT:  LBB11_2: ; %for.body
 ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT:    ; InlineAsm Start
-; ENABLE-NEXT:    mov x10, #0
+; ENABLE-NEXT:    mov x10, #0 ; =0x0
 ; ENABLE-NEXT:    ; InlineAsm End
 ; ENABLE-NEXT:    add w10, w10, w9
-; ENABLE-NEXT:    mov w9, #1
+; ENABLE-NEXT:    mov w9, #1 ; =0x1
 ; ENABLE-NEXT:    str w10, [x8]
 ; ENABLE-NEXT:    ; InlineAsm Start
 ; ENABLE-NEXT:    nop
@@ -849,10 +849,10 @@ define void @infiniteloop2() {
 ; DISABLE-NEXT:  LBB11_2: ; %for.body
 ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT:    ; InlineAsm Start
-; DISABLE-NEXT:    mov x10, #0
+; DISABLE-NEXT:    mov x10, #0 ; =0x0
 ; DISABLE-NEXT:    ; InlineAsm End
 ; DISABLE-NEXT:    add w10, w10, w9
-; DISABLE-NEXT:    mov w9, #1
+; DISABLE-NEXT:    mov w9, #1 ; =0x1
 ; DISABLE-NEXT:    str w10, [x8]
 ; DISABLE-NEXT:    ; InlineAsm Start
 ; DISABLE-NEXT:    nop
@@ -969,8 +969,8 @@ end:
 define i32 @stack_realign(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2) {
 ; ENABLE-LABEL: stack_realign:
 ; ENABLE:       ; %bb.0:
-; ENABLE-NEXT:    lsl w8, w0, w1
-; ENABLE-NEXT:    lsl w9, w1, w0
+; ENABLE-NEXT:    lsl w9, w0, w1
+; ENABLE-NEXT:    lsl w8, w1, w0
 ; ENABLE-NEXT:    cmp w0, w1
 ; ENABLE-NEXT:    b.ge LBB13_2
 ; ENABLE-NEXT:  ; %bb.1: ; %true
@@ -985,8 +985,8 @@ define i32 @stack_realign(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2) {
 ; ENABLE-NEXT:    mov sp, x29
 ; ENABLE-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; ENABLE-NEXT:  LBB13_2: ; %false
-; ENABLE-NEXT:    str w8, [x2]
-; ENABLE-NEXT:    str w9, [x3]
+; ENABLE-NEXT:    str w9, [x2]
+; ENABLE-NEXT:    str w8, [x3]
 ; ENABLE-NEXT:    ret
 ;
 ; DISABLE-LABEL: stack_realign:
@@ -998,15 +998,15 @@ define i32 @stack_realign(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2) {
 ; DISABLE-NEXT:    .cfi_def_cfa w29, 16
 ; DISABLE-NEXT:    .cfi_offset w30, -8
 ; DISABLE-NEXT:    .cfi_offset w29, -16
-; DISABLE-NEXT:    lsl w8, w0, w1
-; DISABLE-NEXT:    lsl w9, w1, w0
+; DISABLE-NEXT:    lsl w9, w0, w1
+; DISABLE-NEXT:    lsl w8, w1, w0
 ; DISABLE-NEXT:    cmp w0, w1
 ; DISABLE-NEXT:    b.ge LBB13_2
 ; DISABLE-NEXT:  ; %bb.1: ; %true
 ; DISABLE-NEXT:    str w0, [sp]
 ; DISABLE-NEXT:  LBB13_2: ; %false
-; DISABLE-NEXT:    str w8, [x2]
-; DISABLE-NEXT:    str w9, [x3]
+; DISABLE-NEXT:    str w9, [x2]
+; DISABLE-NEXT:    str w8, [x3]
 ; DISABLE-NEXT:    mov sp, x29
 ; DISABLE-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; DISABLE-NEXT:    ret
@@ -1058,16 +1058,16 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr
 ; ENABLE-NEXT:    .cfi_offset w26, -80
 ; ENABLE-NEXT:    .cfi_offset w27, -88
 ; ENABLE-NEXT:    .cfi_offset w28, -96
-; ENABLE-NEXT:    add w8, w1, w0
-; ENABLE-NEXT:    lsl w9, w0, w1
-; ENABLE-NEXT:    lsl w10, w1, w0
-; ENABLE-NEXT:    lsr w12, w0, w1
-; ENABLE-NEXT:    lsr w13, w1, w0
-; ENABLE-NEXT:    sub w11, w10, w12
+; ENABLE-NEXT:    lsl w8, w1, w0
+; ENABLE-NEXT:    lsr w10, w0, w1
+; ENABLE-NEXT:    lsl w16, w0, w1
+; ENABLE-NEXT:    lsr w11, w1, w0
+; ENABLE-NEXT:    add w14, w1, w0
+; ENABLE-NEXT:    sub w9, w8, w10
 ; ENABLE-NEXT:    subs w17, w1, w0
-; ENABLE-NEXT:    add w16, w9, w10
-; ENABLE-NEXT:    add w14, w12, w13
-; ENABLE-NEXT:    add w15, w13, w8
+; ENABLE-NEXT:    add w15, w16, w8
+; ENABLE-NEXT:    add w12, w10, w11
+; ENABLE-NEXT:    add w13, w11, w14
 ; ENABLE-NEXT:    b.le LBB14_2
 ; ENABLE-NEXT:  ; %bb.1: ; %true
 ; ENABLE-NEXT:    str w0, [sp]
@@ -1075,15 +1075,15 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr
 ; ENABLE-NEXT:    nop
 ; ENABLE-NEXT:    ; InlineAsm End
 ; ENABLE-NEXT:  LBB14_2: ; %false
-; ENABLE-NEXT:    str w9, [x2]
-; ENABLE-NEXT:    str w10, [x3]
-; ENABLE-NEXT:    str w12, [x4]
-; ENABLE-NEXT:    str w13, [x5]
-; ENABLE-NEXT:    str w8, [x6]
+; ENABLE-NEXT:    str w16, [x2]
+; ENABLE-NEXT:    str w8, [x3]
+; ENABLE-NEXT:    str w10, [x4]
+; ENABLE-NEXT:    str w11, [x5]
+; ENABLE-NEXT:    str w14, [x6]
 ; ENABLE-NEXT:    str w17, [x7]
 ; ENABLE-NEXT:    stp w0, w1, [x2, #4]
-; ENABLE-NEXT:    stp w16, w11, [x2, #12]
-; ENABLE-NEXT:    stp w14, w15, [x2, #20]
+; ENABLE-NEXT:    stp w15, w9, [x2, #12]
+; ENABLE-NEXT:    stp w12, w13, [x2, #20]
 ; ENABLE-NEXT:    sub sp, x29, #80
 ; ENABLE-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
 ; ENABLE-NEXT:    ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
@@ -1117,16 +1117,16 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr
 ; DISABLE-NEXT:    .cfi_offset w26, -80
 ; DISABLE-NEXT:    .cfi_offset w27, -88
 ; DISABLE-NEXT:    .cfi_offset w28, -96
-; DISABLE-NEXT:    add w8, w1, w0
-; DISABLE-NEXT:    lsl w9, w0, w1
-; DISABLE-NEXT:    lsl w10, w1, w0
-; DISABLE-NEXT:    lsr w12, w0, w1
-; DISABLE-NEXT:    lsr w13, w1, w0
-; DISABLE-NEXT:    sub w11, w10, w12
+; DISABLE-NEXT:    lsl w8, w1, w0
+; DISABLE-NEXT:    lsr w10, w0, w1
+; DISABLE-NEXT:    lsl w16, w0, w1
+; DISABLE-NEXT:    lsr w11, w1, w0
+; DISABLE-NEXT:    add w14, w1, w0
+; DISABLE-NEXT:    sub w9, w8, w10
 ; DISABLE-NEXT:    subs w17, w1, w0
-; DISABLE-NEXT:    add w16, w9, w10
-; DISABLE-NEXT:    add w14, w12, w13
-; DISABLE-NEXT:    add w15, w13, w8
+; DISABLE-NEXT:    add w15, w16, w8
+; DISABLE-NEXT:    add w12, w10, w11
+; DISABLE-NEXT:    add w13, w11, w14
 ; DISABLE-NEXT:    b.le LBB14_2
 ; DISABLE-NEXT:  ; %bb.1: ; %true
 ; DISABLE-NEXT:    str w0, [sp]
@@ -1134,15 +1134,15 @@ define void @stack_realign2(i32 %a, i32 %b, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr
 ; DISABLE-NEXT:    nop
 ; DISABLE-NEXT:    ; InlineAsm End
 ; DISABLE-NEXT:  LBB14_2: ; %false
-; DISABLE-NEXT:    str w9, [x2]
-; DISABLE-NEXT:    str w10, [x3]
-; DISABLE-NEXT:    str w12, [x4]
-; DISABLE-NEXT:    str w13, [x5]
-; DISABLE-NEXT:    str w8, [x6]
+; DISABLE-NEXT:    str w16, [x2]
+; DISABLE-NEXT:    str w8, [x3]
+; DISABLE-NEXT:    str w10, [x4]
+; DISABLE-NEXT:    str w11, [x5]
+; DISABLE-NEXT:    str w14, [x6]
 ; DISABLE-NEXT:    str w17, [x7]
 ; DISABLE-NEXT:    stp w0, w1, [x2, #4]
-; DISABLE-NEXT:    stp w16, w11, [x2, #12]
-; DISABLE-NEXT:    stp w14, w15, [x2, #20]
+; DISABLE-NEXT:    stp w15, w9, [x2, #12]
+; DISABLE-NEXT:    stp w12, w13, [x2, #20]
 ; DISABLE-NEXT:    sub sp, x29, #80
 ; DISABLE-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
 ; DISABLE-NEXT:    ldp x20, x19, [sp, #64] ; 16-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index 60c2dada6b0627..43d9eb7f368979 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -249,11 +249,11 @@ define <4 x i64> @zext_v4i8_to_v4i64(<4 x i8> %v0) nounwind {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
-; CHECK-GI-NEXT:    mov d1, v0[1]
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT:    mov d1, v0[1]
 ; CHECK-GI-NEXT:    ushll.2d v0, v0, #0
-; CHECK-GI-NEXT:    ushll.2d v1, v1, #0
 ; CHECK-GI-NEXT:    and.16b v0, v0, v2
+; CHECK-GI-NEXT:    ushll.2d v1, v1, #0
 ; CHECK-GI-NEXT:    and.16b v1, v1, v2
 ; CHECK-GI-NEXT:    ret
   %r = zext <4 x i8> %v0 to <4 x i64>
@@ -266,8 +266,8 @@ define <4 x i64> @sext_v4i8_to_v4i64(<4 x i8> %v0) nounwind {
 ; CHECK-SD-NEXT:    ushll.4s v0, v0, #0
 ; CHECK-SD-NEXT:    ushll.2d v1, v0, #0
 ; CHECK-SD-NEXT:    ushll2.2d v0, v0, #0
-; CHECK-SD-NEXT:    shl.2d v2, v1, #56
 ; CHECK-SD-NEXT:    shl.2d v0, v0, #56
+; CHECK-SD-NEXT:    shl.2d v2, v1, #56
 ; CHECK-SD-NEXT:    sshr.2d v1, v0, #56
 ; CHECK-SD-NEXT:    sshr.2d v0, v2, #56
 ; CHECK-SD-NEXT:    ret
@@ -291,11 +291,11 @@ define <8 x i64> @zext_v8i8_to_v8i64(<8 x i8> %v0) nounwind {
 ; CHECK-SD-LABEL: zext_v8i8_to_v8i64:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    ushll.8h v0, v0, #0
+; CHECK-SD-NEXT:    ushll.4s v1, v0, #0
 ; CHECK-SD-NEXT:    ushll2.4s v2, v0, #0
-; CHECK-SD-NEXT:    ushll.4s v0, v0, #0
+; CHECK-SD-NEXT:    ushll.2d v0, v1, #0
 ; CHECK-SD-NEXT:    ushll2.2d v3, v2, #0
-; CHECK-SD-NEXT:    ushll2.2d v1, v0, #0
-; CHECK-SD-NEXT:    ushll.2d v0, v0, #0
+; CHECK-SD-NEXT:    ushll2.2d v1, v1, #0
 ; CHECK-SD-NEXT:    ushll.2d v2, v2, #0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -304,13 +304,13 @@ define <8 x i64> @zext_v8i8_to_v8i64(<8 x i8> %v0) nounwind {
 ; CHECK-GI-NEXT:    ushll.8h v0, v0, #0
 ; CHECK-GI-NEXT:    mov d1, v0[1]
 ; CHECK-GI-NEXT:    ushll.4s v0, v0, #0
-; CHECK-GI-NEXT:    mov d2, v0[1]
-; CHECK-GI-NEXT:    ushll.4s v3, v1, #0
+; CHECK-GI-NEXT:    ushll.4s v2, v1, #0
+; CHECK-GI-NEXT:    mov d1, v0[1]
 ; CHECK-GI-NEXT:    ushll.2d v0, v0, #0
-; CHECK-GI-NEXT:    mov d4, v3[1]
-; CHECK-GI-NEXT:    ushll.2d v1, v2, #0
-; CHECK-GI-NEXT:    ushll.2d v2, v3, #0
-; CHECK-GI-NEXT:    ushll.2d v3, v4, #0
+; CHECK-GI-NEXT:    mov d3, v2[1]
+; CHECK-GI-NEXT:    ushll.2d v2, v2, #0
+; CHECK-GI-NEXT:    ushll.2d v1, v1, #0
+; CHECK-GI-NEXT:    ushll.2d v3, v3, #0
 ; CHECK-GI-NEXT:    ret
   %r = zext <8 x i8> %v0 to <8 x i64>
   ret <8 x i64> %r
@@ -320,11 +320,11 @@ define <8 x i64> @sext_v8i8_to_v8i64(<8 x i8> %v0) nounwind {
 ; CHECK-SD-LABEL: sext_v8i8_to_v8i64:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    sshll.8h v0, v0, #0
+; CHECK-SD-NEXT:    sshll.4s v1, v0, #0
 ; CHECK-SD-NEXT:    sshll2.4s v2, v0, #0
-; CHECK-SD-NEXT:    sshll.4s v0, v0, #0
+; CHECK-SD-NEXT:    sshll.2d v0, v1, #0
 ; CHECK-SD-NEXT:    sshll2.2d v3, v2, #0
-; CHECK-SD-NEXT:    sshll2.2d v1, v0, #0
-; CHECK-SD-NEXT:    sshll.2d v0, v0, #0
+; CHECK-SD-NEXT:    sshll2.2d v1, v1, #0
 ; CHECK-SD-NEXT:    sshll.2d v2, v2, #0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -333,13 +333,13 @@ define <8 x i64> @sext_v8i8_to_v8i64(<8 x i8> %v0) nounwind {
 ; CHECK-GI-NEXT:    sshll.8h v0, v0, #0
 ; CHECK-GI-NEXT:    mov d1, v0[1]
 ; CHECK-GI-NEXT:    sshll.4s v0, v0, #0
-; CHECK-GI-NEXT:    mov d2, v0[1]
-; CHECK-GI-NEXT:    sshll.4s v3, v1, #0
+; CHECK-GI-NEXT:    sshll.4s v2, v1, #0
+; CHECK-GI-NEXT:    mov d1, v0[1]
 ; CHECK-GI-NEXT:    sshll.2d v0, v0, #0
-; CHECK-GI-NEXT:    mov d4, v3[1]
-; CHECK-GI-NEXT:    sshll.2d v1, v2, #0
-; CHECK-GI-NEXT:    sshll.2d v2, v3, #0
-; CHECK-GI-NEXT:    sshll.2d v3, v4, #0
+; CHECK-GI-NEXT:    mov d3, v2[1]
+; CHECK-GI-NEXT:    sshll.2d v2, v2, #0
+; CHECK-GI-NEXT:    sshll.2d v1, v1, #0
+; CHECK-GI-NEXT:    sshll.2d v3, v3, #0
 ; CHECK-GI-NEXT:    ret
   %r = sext <8 x i8> %v0 to <8 x i64>
   ret <8 x i64> %r
@@ -352,14 +352,13 @@ define <32 x i8> @zext_v32i1(<32 x i1> %arg) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w8, [sp, #64]
 ; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ldr w9, [sp]
-; CHECK-NEXT:    ldr w10, [sp, #8]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    ldr w8, [sp, #72]
-; CHECK-NEXT:    mov.b v0[1], w1
+; CHECK-NEXT:    ldr w9, [sp, #72]
 ; CHECK-NEXT:    movi.16b v2, #1
-; CHECK-NEXT:    mov.b v1[1], w8
+; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    ldr w8, [sp, #80]
+; CHECK-NEXT:    mov.b v0[1], w1
+; CHECK-NEXT:    mov.b v1[1], w9
+; CHECK-NEXT:    ldr w9, [sp]
 ; CHECK-NEXT:    mov.b v0[2], w2
 ; CHECK-NEXT:    mov.b v1[2], w8
 ; CHECK-NEXT:    ldr w8, [sp, #88]
@@ -379,33 +378,34 @@ define <32 x i8> @zext_v32i1(<32 x i1> %arg) {
 ; CHECK-NEXT:    mov.b v1[7], w8
 ; CHECK-NEXT:    ldr w8, [sp, #128]
 ; CHECK-NEXT:    mov.b v0[8], w9
-; CHECK-NEXT:    ldr w9, [sp, #16]
+; CHECK-NEXT:    ldr w9, [sp, #8]
 ; CHECK-NEXT:    mov.b v1[8], w8
 ; CHECK-NEXT:    ldr w8, [sp, #136]
-; CHECK-NEXT:    mov.b v0[9], w10
-; CHECK-NEXT:    ldr w10, [sp, #24]
+; CHECK-NEXT:    mov.b v0[9], w9
+; CHECK-NEXT:    ldr w9, [sp, #16]
 ; CHECK-NEXT:    mov.b v1[9], w8
 ; CHECK-NEXT:    ldr w8, [sp, #144]
 ; CHECK-NEXT:    mov.b v0[10], w9
-; CHECK-NEXT:    ldr w9, [sp, #32]
+; CHECK-NEXT:    ldr w9, [sp, #24]
 ; CHECK-NEXT:    mov.b v1[10], w8
 ; CHECK-NEXT:    ldr w8, [sp, #152]
-; CHECK-NEXT:    mov.b v0[11], w10
-; CHECK-NEXT:    ldr w10, [sp, #40]
+; CHECK-NEXT:    mov.b v0[11], w9
+; CHECK-NEXT:    ldr w9, [sp, #32]
 ; CHECK-NEXT:    mov.b v1[11], w8
 ; CHECK-NEXT:    ldr w8, [sp, #160]
 ; CHECK-NEXT:    mov.b v0[12], w9
-; CHECK-NEXT:    ldr w9, [sp, #48]
+; CHECK-NEXT:    ldr w9, [sp, #40]
 ; CHECK-NEXT:    mov.b v1[12], w8
 ; CHECK-NEXT:    ldr w8, [sp, #168]
-; CHECK-NEXT:    mov.b v0[13], w10
-; CHECK-NEXT:    ldr w10, [sp, #56]
+; CHECK-NEXT:    mov.b v0[13], w9
+; CHECK-NEXT:    ldr w9, [sp, #48]
 ; CHECK-NEXT:    mov.b v1[13], w8
 ; CHECK-NEXT:    ldr w8, [sp, #176]
 ; CHECK-NEXT:    mov.b v0[14], w9
+; CHECK-NEXT:    ldr w9, [sp, #56]
 ; CHECK-NEXT:    mov.b v1[14], w8
 ; CHECK-NEXT:    ldr w8, [sp, #184]
-; CHECK-NEXT:    mov.b v0[15], w10
+; CHECK-NEXT:    mov.b v0[15], w9
 ; CHECK-NEXT:    mov.b v1[15], w8
 ; CHECK-NEXT:    and.16b v0, v0, v2
 ; CHECK-NEXT:    and.16b v1, v1, v2
@@ -418,65 +418,65 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) {
 ; CHECK-LABEL: sext_v32i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w8, [sp, #64]
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ldr w9, [sp]
-; CHECK-NEXT:    ldr w10, [sp, #8]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    ldr w8, [sp, #72]
-; CHECK-NEXT:    mov.b v0[1], w1
-; CHECK-NEXT:    mov.b v1[1], w8
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    ldr w9, [sp, #72]
+; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    ldr w8, [sp, #80]
-; CHECK-NEXT:    mov.b v0[2], w2
-; CHECK-NEXT:    mov.b v1[2], w8
+; CHECK-NEXT:    mov.b v1[1], w1
+; CHECK-NEXT:    mov.b v0[1], w9
+; CHECK-NEXT:    ldr w9, [sp]
+; CHECK-NEXT:    mov.b v1[2], w2
+; CHECK-NEXT:    mov.b v0[2], w8
 ; CHECK-NEXT:    ldr w8, [sp, #88]
-; CHECK-NEXT:    mov.b v0[3], w3
-; CHECK-NEXT:    mov.b v1[3], w8
+; CHECK-NEXT:    mov.b v1[3], w3
+; CHECK-NEXT:    mov.b v0[3], w8
 ; CHECK-NEXT:    ldr w8, [sp, #96]
-; CHECK-NEXT:    mov.b v0[4], w4
-; CHECK-NEXT:    mov.b v1[4], w8
+; CHECK-NEXT:    mov.b v1[4], w4
+; CHECK-NEXT:    mov.b v0[4], w8
 ; CHECK-NEXT:    ldr w8, [sp, #104]
-; CHECK-NEXT:    mov.b v0[5], w5
-; CHECK-NEXT:    mov.b v1[5], w8
+; CHECK-NEXT:    mov.b v1[5], w5
+; CHECK-NEXT:    mov.b v0[5], w8
 ; CHECK-NEXT:    ldr w8, [sp, #112]
-; CHECK-NEXT:    mov.b v0[6], w6
-; CHECK-NEXT:    mov.b v1[6], w8
+; CHECK-NEXT:    mov.b v1[6], w6
+; CHECK-NEXT:    mov.b v0[6], w8
 ; CHECK-NEXT:    ldr w8, [sp, #120]
-; CHECK-NEXT:    mov.b v0[7], w7
-; CHECK-NEXT:    mov.b v1[7], w8
+; CHECK-NEXT:    mov.b v1[7], w7
+; CHECK-NEXT:    mov.b v0[7], w8
 ; CHECK-NEXT:    ldr w8, [sp, #128]
-; CHECK-NEXT:    mov.b v0[8], w9
-; CHECK-NEXT:    ldr w9, [sp, #16]
-; CHECK-NEXT:    mov.b v1[8], w8
+; CHECK-NEXT:    mov.b v1[8], w9
+; CHECK-NEXT:    ldr w9, [sp, #8]
+; CHECK-NEXT:    mov.b v0[8], w8
 ; CHECK-NEXT:    ldr w8, [sp, #136]
-; CHECK-NEXT:    mov.b v0[9], w10
-; CHECK-NEXT:    ldr w10, [sp, #24]
-; CHECK-NEXT:    mov.b v1[9], w8
+; CHECK-NEXT:    mov.b v1[9], w9
+; CHECK-NEXT:    ldr w9, [sp, #16]
+; CHECK-NEXT:    mov.b v0[9], w8
 ; CHECK-NEXT:    ldr w8, [sp, #144]
-; CHECK-NEXT:    mov.b v0[10], w9
-; CHECK-NEXT:    ldr w9, [sp, #32]
-; CHECK-NEXT:    mov.b v1[10], w8
+; CHECK-NEXT:    mov.b v1[10], w9
+; CHECK-NEXT:    ldr w9, [sp, #24]
+; CHECK-NEXT:    mov.b v0[10], w8
 ; CHECK-NEXT:    ldr w8, [sp, #152]
-; CHECK-NEXT:    mov.b v0[11], w10
-; CHECK-NEXT:    ldr w10, [sp, #40]
-; CHECK-NEXT:    mov.b v1[11], w8
+; CHECK-NEXT:    mov.b v1[11], w9
+; CHECK-NEXT:    ldr w9, [sp, #32]
+; CHECK-NEXT:    mov.b v0[11], w8
 ; CHECK-NEXT:    ldr w8, [sp, #160]
-; CHECK-NEXT:    mov.b v0[12], w9
-; CHECK-NEXT:    ldr w9, [sp, #48]
-; CHECK-NEXT:    mov.b v1[12], w8
+; CHECK-NEXT:    mov.b v1[12], w9
+; CHECK-NEXT:    ldr w9, [sp, #40]
+; CHECK-NEXT:    mov.b v0[12], w8
 ; CHECK-NEXT:    ldr w8, [sp, #168]
-; CHECK-NEXT:    mov.b v0[13], w10
-; CHECK-NEXT:    ldr w10, [sp, #56]
-; CHECK-NEXT:    mov.b v1[13], w8
+; CHECK-NEXT:    mov.b v1[13], w9
+; CHECK-NEXT:    ldr w9, [sp, #48]
+; CHECK-NEXT:    mov.b v0[13], w8
 ; CHECK-NEXT:    ldr w8, [sp, #176]
-; CHECK-NEXT:    mov.b v0[14], w9
-; CHECK-NEXT:    mov.b v1[14], w8
+; CHECK-NEXT:    mov.b v1[14], w9
+; CHECK-NEXT:    ldr w9, [sp, #56]
+; CHECK-NEXT:    mov.b v0[14], w8
 ; CHECK-NEXT:    ldr w8, [sp, #184]
-; CHECK-NEXT:    mov.b v0[15], w10
-; CHECK-NEXT:    mov.b v1[15], w8
-; CHECK-NEXT:    shl.16b v0, v0, #7
+; CHECK-NEXT:    mov.b v1[15], w9
+; CHECK-NEXT:    mov.b v0[15], w8
 ; CHECK-NEXT:    shl.16b v1, v1, #7
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
-; CHECK-NEXT:    cmlt.16b v1, v1, #0
+; CHECK-NEXT:    shl.16b v2, v0, #7
+; CHECK-NEXT:    cmlt.16b v0, v1, #0
+; CHECK-NEXT:    cmlt.16b v1, v2, #0
 ; CHECK-NEXT:    ret
   %res = sext <32 x i1> %arg to <32 x i8>
   ret <32 x i8> %res
@@ -489,130 +489,130 @@ define <64 x i8> @zext_v64i1(<64 x i1> %arg) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ldr w8, [sp, #336]
+; CHECK-NEXT:    ldr w9, [sp, #208]
 ; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ldr w9, [sp, #80]
-; CHECK-NEXT:    ldr w10, [sp, #208]
+; CHECK-NEXT:    ldr w10, [sp, #80]
+; CHECK-NEXT:    ldr w11, [sp, #216]
+; CHECK-NEXT:    movi.16b v4, #1
 ; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    ldr w8, [sp, #344]
-; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    ldr w9, [sp, #216]
-; CHECK-NEXT:    fmov s2, w10
-; CHECK-NEXT:    ldr w10, [sp, #352]
-; CHECK-NEXT:    mov.b v3[1], w8
-; CHECK-NEXT:    ldr w8, [sp, #88]
+; CHECK-NEXT:    fmov s1, w10
+; CHECK-NEXT:    ldr w12, [sp, #88]
 ; CHECK-NEXT:    mov.b v0[1], w1
-; CHECK-NEXT:    ldr w11, [sp, #368]
-; CHECK-NEXT:    mov.b v2[1], w9
-; CHECK-NEXT:    ldr w9, [sp, #96]
-; CHECK-NEXT:    mov.b v1[1], w8
-; CHECK-NEXT:    ldr w8, [sp, #360]
-; CHECK-NEXT:    mov.b v3[2], w10
-; CHECK-NEXT:    ldr w10, [sp, #224]
+; CHECK-NEXT:    ldr w9, [sp, #224]
+; CHECK-NEXT:    ldr w10, [sp, #96]
+; CHECK-NEXT:    mov.b v3[1], w8
+; CHECK-NEXT:    mov.b v2[1], w11
+; CHECK-NEXT:    ldr w8, [sp, #352]
+; CHECK-NEXT:    mov.b v1[1], w12
+; CHECK-NEXT:    ldr w11, [sp, #144]
 ; CHECK-NEXT:    mov.b v0[2], w2
-; CHECK-NEXT:    ldr w12, [sp, #384]
-; CHECK-NEXT:    ldr w13, [sp, #400]
-; CHECK-NEXT:    mov.b v1[2], w9
-; CHECK-NEXT:    ldr w9, [sp, #376]
-; CHECK-NEXT:    mov.b v2[2], w10
+; CHECK-NEXT:    mov.b v3[2], w8
+; CHECK-NEXT:    mov.b v2[2], w9
+; CHECK-NEXT:    ldr w8, [sp, #360]
+; CHECK-NEXT:    mov.b v1[2], w10
+; CHECK-NEXT:    ldr w9, [sp, #232]
 ; CHECK-NEXT:    ldr w10, [sp, #104]
-; CHECK-NEXT:    mov.b v3[3], w8
-; CHECK-NEXT:    ldr w8, [sp, #232]
 ; CHECK-NEXT:    mov.b v0[3], w3
-; CHECK-NEXT:    ldr w14, [sp, #416]
+; CHECK-NEXT:    mov.b v3[3], w8
+; CHECK-NEXT:    mov.b v2[3], w9
+; CHECK-NEXT:    ldr w8, [sp, #368]
 ; CHECK-NEXT:    mov.b v1[3], w10
-; CHECK-NEXT:    ldr w10, [sp, #392]
-; CHECK-NEXT:    mov.b v2[3], w8
-; CHECK-NEXT:    ldr w8, [sp, #112]
-; CHECK-NEXT:    mov.b v3[4], w11
-; CHECK-NEXT:    ldr w11, [sp, #240]
+; CHECK-NEXT:    ldr w9, [sp, #240]
+; CHECK-NEXT:    ldr w10, [sp, #112]
 ; CHECK-NEXT:    mov.b v0[4], w4
-; CHECK-NEXT:    ldr w15, [sp, #432]
-; CHECK-NEXT:    mov.b v1[4], w8
-; CHECK-NEXT:    ldr w8, [sp, #408]
-; CHECK-NEXT:    mov.b v2[4], w11
-; CHECK-NEXT:    ldr w11, [sp, #120]
-; CHECK-NEXT:    mov.b v3[5], w9
+; CHECK-NEXT:    mov.b v3[4], w8
+; CHECK-NEXT:    mov.b v2[4], w9
+; CHECK-NEXT:    ldr w8, [sp, #376]
+; CHECK-NEXT:    mov.b v1[4], w10
 ; CHECK-NEXT:    ldr w9, [sp, #248]
+; CHECK-NEXT:    ldr w10, [sp, #120]
 ; CHECK-NEXT:    mov.b v0[5], w5
-; CHECK-NEXT:    ldr w16, [sp, #448]
-; CHECK-NEXT:    mov.b v1[5], w11
-; CHECK-NEXT:    ldr w11, [sp, #424]
+; CHECK-NEXT:    mov.b v3[5], w8
 ; CHECK-NEXT:    mov.b v2[5], w9
-; CHECK-NEXT:    ldr w9, [sp, #128]
-; CHECK-NEXT:    mov.b v3[6], w12
-; CHECK-NEXT:    ldr w12, [sp, #256]
+; CHECK-NEXT:    ldr w8, [sp, #384]
+; CHECK-NEXT:    mov.b v1[5], w10
+; CHECK-NEXT:    ldr w9, [sp, #256]
+; CHECK-NEXT:    ldr w10, [sp, #128]
 ; CHECK-NEXT:    mov.b v0[6], w6
-; CHECK-NEXT:    mov.b v1[6], w9
-; CHECK-NEXT:    ldr w9, [sp, #440]
-; CHECK-NEXT:    mov.b v2[6], w12
-; CHECK-NEXT:    ldr w12, [sp, #136]
-; CHECK-NEXT:    mov.b v3[7], w10
-; CHECK-NEXT:    ldr w10, [sp, #264]
+; CHECK-NEXT:    mov.b v3[6], w8
+; CHECK-NEXT:    mov.b v2[6], w9
+; CHECK-NEXT:    ldr w8, [sp, #392]
+; CHECK-NEXT:    mov.b v1[6], w10
+; CHECK-NEXT:    ldr w9, [sp, #264]
+; CHECK-NEXT:    ldr w10, [sp, #136]
 ; CHECK-NEXT:    mov.b v0[7], w7
-; CHECK-NEXT:    mov.b v1[7], w12
-; CHECK-NEXT:    ldr w12, [sp, #16]
-; CHECK-NEXT:    mov.b v2[7], w10
-; CHECK-NEXT:    ldr w10, [sp, #144]
-; CHECK-NEXT:    mov.b v3[8], w13
-; CHECK-NEXT:    ldr w13, [sp, #272]
-; CHECK-NEXT:    mov.b v0[8], w12
-; CHECK-NEXT:    ldr w12, [sp, #456]
-; CHECK-NEXT:    mov.b v1[8], w10
-; CHECK-NEXT:    ldr w10, [sp, #24]
-; CHECK-NEXT:    mov.b v2[8], w13
-; CHECK-NEXT:    ldr w13, [sp, #152]
-; CHECK-NEXT:    mov.b v3[9], w8
-; CHECK-NEXT:    ldr w8, [sp, #280]
-; CHECK-NEXT:    mov.b v0[9], w10
+; CHECK-NEXT:    mov.b v3[7], w8
+; CHECK-NEXT:    mov.b v2[7], w9
+; CHECK-NEXT:    ldr w8, [sp, #16]
+; CHECK-NEXT:    mov.b v1[7], w10
+; CHECK-NEXT:    ldr w9, [sp, #400]
+; CHECK-NEXT:    ldr w10, [sp, #272]
+; CHECK-NEXT:    mov.b v0[8], w8
+; CHECK-NEXT:    ldr w8, [sp, #24]
+; CHECK-NEXT:    mov.b v3[8], w9
+; CHECK-NEXT:    mov.b v2[8], w10
+; CHECK-NEXT:    ldr w9, [sp, #408]
+; CHECK-NEXT:    mov.b v1[8], w11
+; CHECK-NEXT:    ldr w10, [sp, #280]
+; CHECK-NEXT:    ldr w11, [sp, #152]
+; CHECK-NEXT:    mov.b v0[9], w8
+; CHECK-NEXT:    ldr w8, [sp, #32]
+; CHECK-NEXT:    mov.b v3[9], w9
+; CHECK-NEXT:    mov.b v2[9], w10
+; CHECK-NEXT:    ldr w9, [sp, #416]
+; CHECK-NEXT:    mov.b v1[9], w11
 ; CHECK-NEXT:    ldr w10, [sp, #288]
-; CHECK-NEXT:    mov.b v1[9], w13
-; CHECK-NEXT:    ldr w13, [sp, #32]
-; CHECK-NEXT:    mov.b v2[9], w8
-; CHECK-NEXT:    ldr w8, [sp, #160]
-; CHECK-NEXT:    mov.b v3[10], w14
-; CHECK-NEXT:    ldr w14, [sp, #296]
-; CHECK-NEXT:    mov.b v0[10], w13
-; CHECK-NEXT:    ldr w13, [sp, #312]
-; CHECK-NEXT:    mov.b v1[10], w8
+; CHECK-NEXT:    ldr w11, [sp, #160]
+; CHECK-NEXT:    mov.b v0[10], w8
 ; CHECK-NEXT:    ldr w8, [sp, #40]
+; CHECK-NEXT:    mov.b v3[10], w9
 ; CHECK-NEXT:    mov.b v2[10], w10
-; CHECK-NEXT:    ldr w10, [sp, #168]
-; CHECK-NEXT:    mov.b v3[11], w11
-; CHECK-NEXT:    ldr w11, [sp, #304]
+; CHECK-NEXT:    ldr w9, [sp, #424]
+; CHECK-NEXT:    mov.b v1[10], w11
+; CHECK-NEXT:    ldr w10, [sp, #296]
+; CHECK-NEXT:    ldr w11, [sp, #168]
 ; CHECK-NEXT:    mov.b v0[11], w8
 ; CHECK-NEXT:    ldr w8, [sp, #48]
-; CHECK-NEXT:    mov.b v1[11], w10
-; CHECK-NEXT:    ldr w10, [sp, #176]
-; CHECK-NEXT:    mov.b v2[11], w14
-; CHECK-NEXT:    mov.b v3[12], w15
+; CHECK-NEXT:    mov.b v3[11], w9
+; CHECK-NEXT:    mov.b v2[11], w10
+; CHECK-NEXT:    ldr w9, [sp, #432]
+; CHECK-NEXT:    mov.b v1[11], w11
+; CHECK-NEXT:    ldr w10, [sp, #304]
+; CHECK-NEXT:    ldr w11, [sp, #176]
 ; CHECK-NEXT:    mov.b v0[12], w8
 ; CHECK-NEXT:    ldr w8, [sp, #56]
-; CHECK-NEXT:    mov.b v1[12], w10
-; CHECK-NEXT:    ldr w10, [sp, #184]
-; CHECK-NEXT:    mov.b v2[12], w11
-; CHECK-NEXT:    ldr w11, [sp, #328]
-; CHECK-NEXT:    mov.b v3[13], w9
-; CHECK-NEXT:    ldr w9, [sp, #320]
+; CHECK-NEXT:    mov.b v3[12], w9
+; CHECK-NEXT:    mov.b v2[12], w10
+; CHECK-NEXT:    ldr w9, [sp, #440]
+; CHECK-NEXT:    mov.b v1[12], w11
+; CHECK-NEXT:    ldr w10, [sp, #312]
+; CHECK-NEXT:    ldr w11, [sp, #184]
 ; CHECK-NEXT:    mov.b v0[13], w8
 ; CHECK-NEXT:    ldr w8, [sp, #64]
-; CHECK-NEXT:    mov.b v1[13], w10
-; CHECK-NEXT:    ldr w10, [sp, #192]
-; CHECK-NEXT:    mov.b v2[13], w13
-; CHECK-NEXT:    mov.b v3[14], w16
+; CHECK-NEXT:    mov.b v3[13], w9
+; CHECK-NEXT:    mov.b v2[13], w10
+; CHECK-NEXT:    ldr w9, [sp, #448]
+; CHECK-NEXT:    mov.b v1[13], w11
+; CHECK-NEXT:    ldr w10, [sp, #320]
+; CHECK-NEXT:    ldr w11, [sp, #192]
 ; CHECK-NEXT:    mov.b v0[14], w8
 ; CHECK-NEXT:    ldr w8, [sp, #72]
-; CHECK-NEXT:    mov.b v1[14], w10
-; CHECK-NEXT:    mov.b v2[14], w9
-; CHECK-NEXT:    ldr w9, [sp, #200]
-; CHECK-NEXT:    movi.16b v4, #1
+; CHECK-NEXT:    mov.b v3[14], w9
+; CHECK-NEXT:    mov.b v2[14], w10
+; CHECK-NEXT:    ldr w9, [sp, #456]
+; CHECK-NEXT:    mov.b v1[14], w11
+; CHECK-NEXT:    ldr w10, [sp, #328]
+; CHECK-NEXT:    ldr w11, [sp, #200]
 ; CHECK-NEXT:    mov.b v0[15], w8
-; CHECK-NEXT:    mov.b v1[15], w9
-; CHECK-NEXT:    mov.b v2[15], w11
-; CHECK-NEXT:    mov.b v3[15], w12
+; CHECK-NEXT:    mov.b v3[15], w9
+; CHECK-NEXT:    mov.b v2[15], w10
+; CHECK-NEXT:    mov.b v1[15], w11
 ; CHECK-NEXT:    and.16b v0, v0, v4
-; CHECK-NEXT:    and.16b v1, v1, v4
 ; CHECK-NEXT:    and.16b v2, v2, v4
 ; CHECK-NEXT:    and.16b v3, v3, v4
+; CHECK-NEXT:    and.16b v1, v1, v4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = zext <64 x i1> %arg to <64 x i8>
@@ -626,133 +626,133 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ldr w8, [sp, #336]
-; CHECK-NEXT:    fmov s3, w0
-; CHECK-NEXT:    ldr w9, [sp, #80]
-; CHECK-NEXT:    ldr w10, [sp, #208]
+; CHECK-NEXT:    ldr w9, [sp, #208]
+; CHECK-NEXT:    fmov s2, w0
+; CHECK-NEXT:    ldr w10, [sp, #80]
+; CHECK-NEXT:    ldr w11, [sp, #216]
+; CHECK-NEXT:    ldr w12, [sp, #88]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ldr w8, [sp, #344]
 ; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    ldr w9, [sp, #88]
-; CHECK-NEXT:    fmov s2, w10
+; CHECK-NEXT:    ldr w8, [sp, #344]
+; CHECK-NEXT:    fmov s3, w10
+; CHECK-NEXT:    mov.b v2[1], w1
+; CHECK-NEXT:    ldr w9, [sp, #224]
 ; CHECK-NEXT:    ldr w10, [sp, #96]
 ; CHECK-NEXT:    mov.b v0[1], w8
-; CHECK-NEXT:    ldr w8, [sp, #216]
-; CHECK-NEXT:    mov.b v1[1], w9
-; CHECK-NEXT:    ldr w9, [sp, #352]
-; CHECK-NEXT:    mov.b v3[1], w1
-; CHECK-NEXT:    ldr w11, [sp, #104]
-; CHECK-NEXT:    mov.b v2[1], w8
+; CHECK-NEXT:    mov.b v1[1], w11
+; CHECK-NEXT:    ldr w8, [sp, #352]
+; CHECK-NEXT:    mov.b v3[1], w12
+; CHECK-NEXT:    ldr w11, [sp, #144]
+; CHECK-NEXT:    mov.b v2[2], w2
+; CHECK-NEXT:    mov.b v0[2], w8
+; CHECK-NEXT:    mov.b v1[2], w9
 ; CHECK-NEXT:    ldr w8, [sp, #360]
-; CHECK-NEXT:    mov.b v0[2], w9
-; CHECK-NEXT:    ldr w9, [sp, #224]
-; CHECK-NEXT:    mov.b v1[2], w10
-; CHECK-NEXT:    ldr w10, [sp, #368]
-; CHECK-NEXT:    mov.b v3[2], w2
-; CHECK-NEXT:    ldr w12, [sp, #112]
-; CHECK-NEXT:    mov.b v2[2], w9
-; CHECK-NEXT:    ldr w9, [sp, #376]
+; CHECK-NEXT:    mov.b v3[2], w10
+; CHECK-NEXT:    ldr w9, [sp, #232]
+; CHECK-NEXT:    ldr w10, [sp, #104]
+; CHECK-NEXT:    mov.b v2[3], w3
 ; CHECK-NEXT:    mov.b v0[3], w8
-; CHECK-NEXT:    ldr w8, [sp, #232]
-; CHECK-NEXT:    mov.b v1[3], w11
-; CHECK-NEXT:    ldr w13, [sp, #120]
-; CHECK-NEXT:    mov.b v3[3], w3
-; CHECK-NEXT:    ldr w11, [sp, #384]
-; CHECK-NEXT:    mov.b v2[3], w8
-; CHECK-NEXT:    ldr w14, [sp, #128]
-; CHECK-NEXT:    mov.b v0[4], w10
-; CHECK-NEXT:    ldr w10, [sp, #240]
-; CHECK-NEXT:    mov.b v1[4], w12
-; CHECK-NEXT:    ldr w8, [sp, #392]
-; CHECK-NEXT:    mov.b v3[4], w4
-; CHECK-NEXT:    ldr w15, [sp, #136]
-; CHECK-NEXT:    mov.b v2[4], w10
-; CHECK-NEXT:    ldr w12, [sp, #400]
-; CHECK-NEXT:    mov.b v0[5], w9
+; CHECK-NEXT:    mov.b v1[3], w9
+; CHECK-NEXT:    ldr w8, [sp, #368]
+; CHECK-NEXT:    mov.b v3[3], w10
+; CHECK-NEXT:    ldr w9, [sp, #240]
+; CHECK-NEXT:    ldr w10, [sp, #112]
+; CHECK-NEXT:    mov.b v2[4], w4
+; CHECK-NEXT:    mov.b v0[4], w8
+; CHECK-NEXT:    mov.b v1[4], w9
+; CHECK-NEXT:    ldr w8, [sp, #376]
+; CHECK-NEXT:    mov.b v3[4], w10
 ; CHECK-NEXT:    ldr w9, [sp, #248]
-; CHECK-NEXT:    mov.b v1[5], w13
-; CHECK-NEXT:    ldr w16, [sp, #144]
-; CHECK-NEXT:    mov.b v3[5], w5
-; CHECK-NEXT:    ldr w10, [sp, #408]
-; CHECK-NEXT:    mov.b v2[5], w9
-; CHECK-NEXT:    ldr w13, [sp, #416]
-; CHECK-NEXT:    mov.b v0[6], w11
-; CHECK-NEXT:    ldr w11, [sp, #256]
-; CHECK-NEXT:    mov.b v1[6], w14
-; CHECK-NEXT:    ldr w9, [sp, #424]
-; CHECK-NEXT:    mov.b v3[6], w6
-; CHECK-NEXT:    ldr w14, [sp, #432]
-; CHECK-NEXT:    mov.b v2[6], w11
-; CHECK-NEXT:    ldr w11, [sp, #440]
+; CHECK-NEXT:    ldr w10, [sp, #120]
+; CHECK-NEXT:    mov.b v2[5], w5
+; CHECK-NEXT:    mov.b v0[5], w8
+; CHECK-NEXT:    mov.b v1[5], w9
+; CHECK-NEXT:    ldr w8, [sp, #384]
+; CHECK-NEXT:    mov.b v3[5], w10
+; CHECK-NEXT:    ldr w9, [sp, #256]
+; CHECK-NEXT:    ldr w10, [sp, #128]
+; CHECK-NEXT:    mov.b v2[6], w6
+; CHECK-NEXT:    mov.b v0[6], w8
+; CHECK-NEXT:    mov.b v1[6], w9
+; CHECK-NEXT:    ldr w8, [sp, #392]
+; CHECK-NEXT:    mov.b v3[6], w10
+; CHECK-NEXT:    ldr w9, [sp, #264]
+; CHECK-NEXT:    ldr w10, [sp, #136]
+; CHECK-NEXT:    mov.b v2[7], w7
 ; CHECK-NEXT:    mov.b v0[7], w8
-; CHECK-NEXT:    ldr w8, [sp, #264]
-; CHECK-NEXT:    mov.b v1[7], w15
-; CHECK-NEXT:    ldr w15, [sp, #448]
-; CHECK-NEXT:    mov.b v3[7], w7
-; CHECK-NEXT:    mov.b v2[7], w8
+; CHECK-NEXT:    mov.b v1[7], w9
 ; CHECK-NEXT:    ldr w8, [sp, #16]
-; CHECK-NEXT:    mov.b v0[8], w12
-; CHECK-NEXT:    ldr w12, [sp, #272]
-; CHECK-NEXT:    mov.b v1[8], w16
-; CHECK-NEXT:    ldr w16, [sp, #456]
-; CHECK-NEXT:    mov.b v3[8], w8
-; CHECK-NEXT:    ldr w8, [sp, #152]
-; CHECK-NEXT:    mov.b v2[8], w12
-; CHECK-NEXT:    ldr w12, [sp, #24]
-; CHECK-NEXT:    mov.b v0[9], w10
+; CHECK-NEXT:    mov.b v3[7], w10
+; CHECK-NEXT:    ldr w9, [sp, #400]
+; CHECK-NEXT:    ldr w10, [sp, #272]
+; CHECK-NEXT:    mov.b v2[8], w8
+; CHECK-NEXT:    ldr w8, [sp, #24]
+; CHECK-NEXT:    mov.b v0[8], w9
+; CHECK-NEXT:    mov.b v1[8], w10
+; CHECK-NEXT:    ldr w9, [sp, #408]
+; CHECK-NEXT:    mov.b v3[8], w11
 ; CHECK-NEXT:    ldr w10, [sp, #280]
-; CHECK-NEXT:    mov.b v1[9], w8
-; CHECK-NEXT:    ldr w8, [sp, #288]
-; CHECK-NEXT:    mov.b v3[9], w12
-; CHECK-NEXT:    ldr w12, [sp, #160]
-; CHECK-NEXT:    mov.b v2[9], w10
-; CHECK-NEXT:    ldr w10, [sp, #32]
-; CHECK-NEXT:    mov.b v0[10], w13
-; CHECK-NEXT:    ldr w13, [sp, #296]
-; CHECK-NEXT:    mov.b v1[10], w12
-; CHECK-NEXT:    ldr w12, [sp, #168]
-; CHECK-NEXT:    mov.b v3[10], w10
-; CHECK-NEXT:    ldr w10, [sp, #176]
+; CHECK-NEXT:    ldr w11, [sp, #152]
+; CHECK-NEXT:    mov.b v2[9], w8
+; CHECK-NEXT:    ldr w8, [sp, #32]
+; CHECK-NEXT:    mov.b v0[9], w9
+; CHECK-NEXT:    mov.b v1[9], w10
+; CHECK-NEXT:    ldr w9, [sp, #416]
+; CHECK-NEXT:    mov.b v3[9], w11
+; CHECK-NEXT:    ldr w10, [sp, #288]
+; CHECK-NEXT:    ldr w11, [sp, #160]
 ; CHECK-NEXT:    mov.b v2[10], w8
 ; CHECK-NEXT:    ldr w8, [sp, #40]
-; CHECK-NEXT:    mov.b v0[11], w9
-; CHECK-NEXT:    ldr w9, [sp, #304]
-; CHECK-NEXT:    mov.b v1[11], w12
-; CHECK-NEXT:    ldr w12, [sp, #312]
-; CHECK-NEXT:    mov.b v3[11], w8
+; CHECK-NEXT:    mov.b v0[10], w9
+; CHECK-NEXT:    mov.b v1[10], w10
+; CHECK-NEXT:    ldr w9, [sp, #424]
+; CHECK-NEXT:    mov.b v3[10], w11
+; CHECK-NEXT:    ldr w10, [sp, #296]
+; CHECK-NEXT:    ldr w11, [sp, #168]
+; CHECK-NEXT:    mov.b v2[11], w8
 ; CHECK-NEXT:    ldr w8, [sp, #48]
-; CHECK-NEXT:    mov.b v2[11], w13
-; CHECK-NEXT:    mov.b v0[12], w14
-; CHECK-NEXT:    mov.b v1[12], w10
-; CHECK-NEXT:    ldr w10, [sp, #184]
-; CHECK-NEXT:    mov.b v3[12], w8
+; CHECK-NEXT:    mov.b v0[11], w9
+; CHECK-NEXT:    mov.b v1[11], w10
+; CHECK-NEXT:    ldr w9, [sp, #432]
+; CHECK-NEXT:    mov.b v3[11], w11
+; CHECK-NEXT:    ldr w10, [sp, #304]
+; CHECK-NEXT:    ldr w11, [sp, #176]
+; CHECK-NEXT:    mov.b v2[12], w8
 ; CHECK-NEXT:    ldr w8, [sp, #56]
-; CHECK-NEXT:    mov.b v2[12], w9
-; CHECK-NEXT:    ldr w9, [sp, #320]
-; CHECK-NEXT:    mov.b v0[13], w11
-; CHECK-NEXT:    ldr w11, [sp, #328]
-; CHECK-NEXT:    mov.b v1[13], w10
-; CHECK-NEXT:    ldr w10, [sp, #192]
-; CHECK-NEXT:    mov.b v3[13], w8
+; CHECK-NEXT:    mov.b v0[12], w9
+; CHECK-NEXT:    mov.b v1[12], w10
+; CHECK-NEXT:    ldr w9, [sp, #440]
+; CHECK-NEXT:    mov.b v3[12], w11
+; CHECK-NEXT:    ldr w10, [sp, #312]
+; CHECK-NEXT:    ldr w11, [sp, #184]
+; CHECK-NEXT:    mov.b v2[13], w8
 ; CHECK-NEXT:    ldr w8, [sp, #64]
-; CHECK-NEXT:    mov.b v2[13], w12
-; CHECK-NEXT:    mov.b v0[14], w15
-; CHECK-NEXT:    mov.b v1[14], w10
-; CHECK-NEXT:    ldr w10, [sp, #200]
-; CHECK-NEXT:    mov.b v3[14], w8
+; CHECK-NEXT:    mov.b v0[13], w9
+; CHECK-NEXT:    mov.b v1[13], w10
+; CHECK-NEXT:    ldr w9, [sp, #448]
+; CHECK-NEXT:    mov.b v3[13], w11
+; CHECK-NEXT:    ldr w10, [sp, #320]
+; CHECK-NEXT:    ldr w11, [sp, #192]
+; CHECK-NEXT:    mov.b v2[14], w8
 ; CHECK-NEXT:    ldr w8, [sp, #72]
-; CHECK-NEXT:    mov.b v2[14], w9
-; CHECK-NEXT:    mov.b v0[15], w16
+; CHECK-NEXT:    mov.b v0[14], w9
+; CHECK-NEXT:    mov.b v1[14], w10
+; CHECK-NEXT:    ldr w9, [sp, #456]
+; CHECK-NEXT:    mov.b v3[14], w11
+; CHECK-NEXT:    ldr w10, [sp, #328]
+; CHECK-NEXT:    ldr w11, [sp, #200]
+; CHECK-NEXT:    mov.b v2[15], w8
+; CHECK-NEXT:    mov.b v0[15], w9
 ; CHECK-NEXT:    mov.b v1[15], w10
-; CHECK-NEXT:    mov.b v3[15], w8
-; CHECK-NEXT:    mov.b v2[15], w11
-; CHECK-NEXT:    shl.16b v4, v0, #7
-; CHECK-NEXT:    shl.16b v1, v1, #7
-; CHECK-NEXT:    shl.16b v3, v3, #7
+; CHECK-NEXT:    mov.b v3[15], w11
 ; CHECK-NEXT:    shl.16b v2, v2, #7
-; CHECK-NEXT:    cmlt.16b v0, v3, #0
-; CHECK-NEXT:    cmlt.16b v1, v1, #0
-; CHECK-NEXT:    cmlt.16b v2, v2, #0
-; CHECK-NEXT:    cmlt.16b v3, v4, #0
+; CHECK-NEXT:    shl.16b v4, v1, #7
+; CHECK-NEXT:    shl.16b v5, v0, #7
+; CHECK-NEXT:    shl.16b v3, v3, #7
+; CHECK-NEXT:    cmlt.16b v0, v2, #0
+; CHECK-NEXT:    cmlt.16b v2, v4, #0
+; CHECK-NEXT:    cmlt.16b v1, v3, #0
+; CHECK-NEXT:    cmlt.16b v3, v5, #0
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = sext <64 x i1> %arg to <64 x i8>

diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
index 300bcbc503d01a..b89232c03f1363 100644
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -107,9 +107,9 @@ define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8
 ; CHECK-NEXT:    adrp x8, .LCPI8_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
+; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI8_0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI8_0]
 ; CHECK-NEXT:    tbl.8b v0, { v0, v1 }, v4
 ; CHECK-NEXT:    tbl.8b v1, { v2, v3 }, v4
 ; CHECK-NEXT:    mov.s v0[1], v1[1]
@@ -142,11 +142,11 @@ define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8
 define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT:    ret
@@ -160,11 +160,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov s4, w0
-; CHECK-NEXT:    mov w8, #32
+; CHECK-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    mov.b v4[1], w0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    mov.b v4[1], w0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    mov.b v4[2], w0
 ; CHECK-NEXT:    mov.b v4[3], w0
@@ -173,19 +173,19 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
 ; CHECK-NEXT:    mov.b v4[6], w0
 ; CHECK-NEXT:    mov.b v4[7], w0
 ; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov w8, #36
+; CHECK-NEXT:    mov w8, #36 // =0x24
 ; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov w8, #40
+; CHECK-NEXT:    mov w8, #40 // =0x28
 ; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov w8, #44
+; CHECK-NEXT:    mov w8, #44 // =0x2c
 ; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov w8, #48
+; CHECK-NEXT:    mov w8, #48 // =0x30
 ; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov w8, #52
+; CHECK-NEXT:    mov w8, #52 // =0x34
 ; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    mov w8, #56
+; CHECK-NEXT:    mov w8, #56 // =0x38
 ; CHECK-NEXT:    mov.b v4[14], w8
-; CHECK-NEXT:    mov w8, #60
+; CHECK-NEXT:    mov w8, #60 // =0x3c
 ; CHECK-NEXT:    mov.b v4[15], w8
 ; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT:    ret
@@ -214,11 +214,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    fmov s4, w8
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    fmov s4, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    mov.b v4[1], w8
 ; CHECK-NEXT:    mov.b v4[2], w8
@@ -226,22 +226,22 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
 ; CHECK-NEXT:    mov.b v4[4], w8
 ; CHECK-NEXT:    mov.b v4[5], w8
 ; CHECK-NEXT:    mov.b v4[6], w8
-; CHECK-NEXT:    mov w8, #32
+; CHECK-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEXT:    mov.b v4[7], w0
 ; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    mov w8, #36
+; CHECK-NEXT:    mov w8, #36 // =0x24
 ; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    mov w8, #40
+; CHECK-NEXT:    mov w8, #40 // =0x28
 ; CHECK-NEXT:    mov.b v4[10], w8
-; CHECK-NEXT:    mov w8, #44
+; CHECK-NEXT:    mov w8, #44 // =0x2c
 ; CHECK-NEXT:    mov.b v4[11], w8
-; CHECK-NEXT:    mov w8, #48
+; CHECK-NEXT:    mov w8, #48 // =0x30
 ; CHECK-NEXT:    mov.b v4[12], w8
-; CHECK-NEXT:    mov w8, #52
+; CHECK-NEXT:    mov w8, #52 // =0x34
 ; CHECK-NEXT:    mov.b v4[13], w8
-; CHECK-NEXT:    mov w8, #56
+; CHECK-NEXT:    mov w8, #56 // =0x38
 ; CHECK-NEXT:    mov.b v4[14], w8
-; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    mov w8, #31 // =0x1f
 ; CHECK-NEXT:    mov.b v4[15], w8
 ; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT:    ret
@@ -274,11 +274,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
 ; CHECK-NEXT:    adrp x8, .LCPI12_0
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT:    mov.b v4[0], w0
 ; CHECK-NEXT:    tbl.16b v2, { v2, v3 }, v5
+; CHECK-NEXT:    mov.b v4[0], w0
 ; CHECK-NEXT:    mov.b v4[1], w0
 ; CHECK-NEXT:    mov.b v4[2], w0
 ; CHECK-NEXT:    mov.b v4[3], w0
@@ -315,22 +315,22 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
 define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #255
 ; CHECK-NEXT:    dup.16b v4, w0
-; CHECK-NEXT:    adrp x9, .LCPI13_0
+; CHECK-NEXT:    mov w8, #255 // =0xff
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    mov.b v4[8], w8
-; CHECK-NEXT:    ldr q5, [x9, :lo12:.LCPI13_0]
 ; CHECK-NEXT:    mov.b v4[9], w8
-; CHECK-NEXT:    tbl.16b v2, { v2, v3 }, v5
 ; CHECK-NEXT:    mov.b v4[10], w8
 ; CHECK-NEXT:    mov.b v4[11], w8
 ; CHECK-NEXT:    mov.b v4[12], w8
 ; CHECK-NEXT:    mov.b v4[13], w8
+; CHECK-NEXT:    adrp x8, .LCPI13_0
+; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI13_0]
 ; CHECK-NEXT:    adrp x8, .LCPI13_1
+; CHECK-NEXT:    tbl.16b v2, { v2, v3 }, v5
 ; CHECK-NEXT:    tbl.16b v3, { v0, v1 }, v4
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI13_1]
 ; CHECK-NEXT:    tbl.16b v0, { v2, v3 }, v0
@@ -379,11 +379,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT:    ret
@@ -414,11 +414,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT:    ret
@@ -449,11 +449,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8>
 define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    tbl.16b v0, { v0, v1, v2, v3 }, v4
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index 3003e4c1c411ee..1cc9040b5a2dc3 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -288,43 +288,42 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-GI-LABEL: uabd16b_rdx_i32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov d3, v0[1]
-; CHECK-GI-NEXT:    ushll.8h v4, v1, #0
-; CHECK-GI-NEXT:    mov d1, v1[1]
+; CHECK-GI-NEXT:    mov d4, v1[1]
 ; CHECK-GI-NEXT:    ushll.8h v0, v0, #0
-; CHECK-GI-NEXT:    mov d6, v4[1]
-; CHECK-GI-NEXT:    ushll.8h v3, v3, #0
-; CHECK-GI-NEXT:    mov d5, v0[1]
 ; CHECK-GI-NEXT:    ushll.8h v1, v1, #0
-; CHECK-GI-NEXT:    mov d7, v3[1]
-; CHECK-GI-NEXT:    mov d16, v1[1]
 ; CHECK-GI-NEXT:    movi.2d v2, #0000000000000000
-; CHECK-GI-NEXT:    usubl.4s v0, v0, v4
-; CHECK-GI-NEXT:    usubl.4s v5, v5, v6
-; CHECK-GI-NEXT:    usubl.4s v1, v3, v1
-; CHECK-GI-NEXT:    usubl.4s v3, v7, v16
-; CHECK-GI-NEXT:    cmgt.4s v4, v2, v0
-; CHECK-GI-NEXT:    cmgt.4s v6, v2, v5
-; CHECK-GI-NEXT:    cmgt.4s v7, v2, v1
-; CHECK-GI-NEXT:    cmgt.4s v2, v2, v3
-; CHECK-GI-NEXT:    shl.4s v4, v4, #31
-; CHECK-GI-NEXT:    shl.4s v6, v6, #31
+; CHECK-GI-NEXT:    mov d5, v0[1]
+; CHECK-GI-NEXT:    ushll.8h v3, v3, #0
+; CHECK-GI-NEXT:    ushll.8h v4, v4, #0
+; CHECK-GI-NEXT:    mov d7, v1[1]
+; CHECK-GI-NEXT:    usubl.4s v0, v0, v1
+; CHECK-GI-NEXT:    mov d6, v3[1]
+; CHECK-GI-NEXT:    mov d16, v4[1]
+; CHECK-GI-NEXT:    usubl.4s v3, v3, v4
+; CHECK-GI-NEXT:    usubl.4s v1, v5, v7
+; CHECK-GI-NEXT:    cmgt.4s v5, v2, v0
+; CHECK-GI-NEXT:    usubl.4s v4, v6, v16
+; CHECK-GI-NEXT:    cmgt.4s v7, v2, v3
+; CHECK-GI-NEXT:    neg.4s v16, v0
+; CHECK-GI-NEXT:    cmgt.4s v6, v2, v1
+; CHECK-GI-NEXT:    shl.4s v5, v5, #31
+; CHECK-GI-NEXT:    neg.4s v17, v1
+; CHECK-GI-NEXT:    neg.4s v18, v3
 ; CHECK-GI-NEXT:    shl.4s v7, v7, #31
+; CHECK-GI-NEXT:    cmgt.4s v2, v2, v4
+; CHECK-GI-NEXT:    shl.4s v6, v6, #31
+; CHECK-GI-NEXT:    neg.4s v19, v4
+; CHECK-GI-NEXT:    sshr.4s v5, v5, #31
+; CHECK-GI-NEXT:    sshr.4s v7, v7, #31
 ; CHECK-GI-NEXT:    shl.4s v2, v2, #31
-; CHECK-GI-NEXT:    sshr.4s v4, v4, #31
-; CHECK-GI-NEXT:    neg.4s v17, v0
 ; CHECK-GI-NEXT:    sshr.4s v6, v6, #31
-; CHECK-GI-NEXT:    neg.4s v16, v5
-; CHECK-GI-NEXT:    neg.4s v18, v1
-; CHECK-GI-NEXT:    neg.4s v19, v3
-; CHECK-GI-NEXT:    sshr.4s v7, v7, #31
+; CHECK-GI-NEXT:    bit.16b v0, v16, v5
+; CHECK-GI-NEXT:    bit.16b v3, v18, v7
 ; CHECK-GI-NEXT:    sshr.4s v2, v2, #31
-; CHECK-GI-NEXT:    bit.16b v0, v17, v4
-; CHECK-GI-NEXT:    mov.16b v4, v6
-; CHECK-GI-NEXT:    bsl.16b v4, v16, v5
-; CHECK-GI-NEXT:    bit.16b v1, v18, v7
-; CHECK-GI-NEXT:    bsl.16b v2, v19, v3
-; CHECK-GI-NEXT:    add.4s v0, v0, v4
-; CHECK-GI-NEXT:    add.4s v1, v1, v2
+; CHECK-GI-NEXT:    bit.16b v1, v17, v6
+; CHECK-GI-NEXT:    bsl.16b v2, v19, v4
+; CHECK-GI-NEXT:    add.4s v0, v0, v1
+; CHECK-GI-NEXT:    add.4s v1, v3, v2
 ; CHECK-GI-NEXT:    add.4s v0, v0, v1
 ; CHECK-GI-NEXT:    addv.4s s0, v0
 ; CHECK-GI-NEXT:    fmov w0, s0
@@ -351,43 +350,42 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-GI-LABEL: sabd16b_rdx_i32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov d3, v0[1]
-; CHECK-GI-NEXT:    sshll.8h v4, v1, #0
-; CHECK-GI-NEXT:    mov d1, v1[1]
+; CHECK-GI-NEXT:    mov d4, v1[1]
 ; CHECK-GI-NEXT:    sshll.8h v0, v0, #0
-; CHECK-GI-NEXT:    mov d6, v4[1]
-; CHECK-GI-NEXT:    sshll.8h v3, v3, #0
-; CHECK-GI-NEXT:    mov d5, v0[1]
 ; CHECK-GI-NEXT:    sshll.8h v1, v1, #0
-; CHECK-GI-NEXT:    mov d7, v3[1]
-; CHECK-GI-NEXT:    mov d16, v1[1]
 ; CHECK-GI-NEXT:    movi.2d v2, #0000000000000000
-; CHECK-GI-NEXT:    ssubl.4s v0, v0, v4
-; CHECK-GI-NEXT:    ssubl.4s v5, v5, v6
-; CHECK-GI-NEXT:    ssubl.4s v1, v3, v1
-; CHECK-GI-NEXT:    ssubl.4s v3, v7, v16
-; CHECK-GI-NEXT:    cmgt.4s v4, v2, v0
-; CHECK-GI-NEXT:    cmgt.4s v6, v2, v5
-; CHECK-GI-NEXT:    cmgt.4s v7, v2, v1
-; CHECK-GI-NEXT:    cmgt.4s v2, v2, v3
-; CHECK-GI-NEXT:    shl.4s v4, v4, #31
-; CHECK-GI-NEXT:    shl.4s v6, v6, #31
+; CHECK-GI-NEXT:    mov d5, v0[1]
+; CHECK-GI-NEXT:    sshll.8h v3, v3, #0
+; CHECK-GI-NEXT:    sshll.8h v4, v4, #0
+; CHECK-GI-NEXT:    mov d7, v1[1]
+; CHECK-GI-NEXT:    ssubl.4s v0, v0, v1
+; CHECK-GI-NEXT:    mov d6, v3[1]
+; CHECK-GI-NEXT:    mov d16, v4[1]
+; CHECK-GI-NEXT:    ssubl.4s v3, v3, v4
+; CHECK-GI-NEXT:    ssubl.4s v1, v5, v7
+; CHECK-GI-NEXT:    cmgt.4s v5, v2, v0
+; CHECK-GI-NEXT:    ssubl.4s v4, v6, v16
+; CHECK-GI-NEXT:    cmgt.4s v7, v2, v3
+; CHECK-GI-NEXT:    neg.4s v16, v0
+; CHECK-GI-NEXT:    cmgt.4s v6, v2, v1
+; CHECK-GI-NEXT:    shl.4s v5, v5, #31
+; CHECK-GI-NEXT:    neg.4s v17, v1
+; CHECK-GI-NEXT:    neg.4s v18, v3
 ; CHECK-GI-NEXT:    shl.4s v7, v7, #31
+; CHECK-GI-NEXT:    cmgt.4s v2, v2, v4
+; CHECK-GI-NEXT:    shl.4s v6, v6, #31
+; CHECK-GI-NEXT:    neg.4s v19, v4
+; CHECK-GI-NEXT:    sshr.4s v5, v5, #31
+; CHECK-GI-NEXT:    sshr.4s v7, v7, #31
 ; CHECK-GI-NEXT:    shl.4s v2, v2, #31
-; CHECK-GI-NEXT:    sshr.4s v4, v4, #31
-; CHECK-GI-NEXT:    neg.4s v17, v0
 ; CHECK-GI-NEXT:    sshr.4s v6, v6, #31
-; CHECK-GI-NEXT:    neg.4s v16, v5
-; CHECK-GI-NEXT:    neg.4s v18, v1
-; CHECK-GI-NEXT:    neg.4s v19, v3
-; CHECK-GI-NEXT:    sshr.4s v7, v7, #31
+; CHECK-GI-NEXT:    bit.16b v0, v16, v5
+; CHECK-GI-NEXT:    bit.16b v3, v18, v7
 ; CHECK-GI-NEXT:    sshr.4s v2, v2, #31
-; CHECK-GI-NEXT:    bit.16b v0, v17, v4
-; CHECK-GI-NEXT:    mov.16b v4, v6
-; CHECK-GI-NEXT:    bsl.16b v4, v16, v5
-; CHECK-GI-NEXT:    bit.16b v1, v18, v7
-; CHECK-GI-NEXT:    bsl.16b v2, v19, v3
-; CHECK-GI-NEXT:    add.4s v0, v0, v4
-; CHECK-GI-NEXT:    add.4s v1, v1, v2
+; CHECK-GI-NEXT:    bit.16b v1, v17, v6
+; CHECK-GI-NEXT:    bsl.16b v2, v19, v4
+; CHECK-GI-NEXT:    add.4s v0, v0, v1
+; CHECK-GI-NEXT:    add.4s v1, v3, v2
 ; CHECK-GI-NEXT:    add.4s v0, v0, v1
 ; CHECK-GI-NEXT:    addv.4s s0, v0
 ; CHECK-GI-NEXT:    fmov w0, s0
@@ -427,13 +425,13 @@ define i32 @uabd8h_rdx(ptr %a, ptr %b) {
 ; CHECK-GI-NEXT:    usubl.4s v2, v3, v4
 ; CHECK-GI-NEXT:    cmgt.4s v3, v0, v1
 ; CHECK-GI-NEXT:    neg.4s v4, v1
-; CHECK-GI-NEXT:    cmgt.4s v0, v0, v2
 ; CHECK-GI-NEXT:    shl.4s v3, v3, #31
-; CHECK-GI-NEXT:    shl.4s v0, v0, #31
+; CHECK-GI-NEXT:    cmgt.4s v0, v0, v2
 ; CHECK-GI-NEXT:    neg.4s v5, v2
 ; CHECK-GI-NEXT:    sshr.4s v3, v3, #31
-; CHECK-GI-NEXT:    sshr.4s v0, v0, #31
+; CHECK-GI-NEXT:    shl.4s v0, v0, #31
 ; CHECK-GI-NEXT:    bit.16b v1, v4, v3
+; CHECK-GI-NEXT:    sshr.4s v0, v0, #31
 ; CHECK-GI-NEXT:    bsl.16b v0, v5, v2
 ; CHECK-GI-NEXT:    add.4s v0, v1, v0
 ; CHECK-GI-NEXT:    addv.4s s0, v0
@@ -470,8 +468,8 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-GI-NEXT:    neg.4s v4, v0
 ; CHECK-GI-NEXT:    cmgt.4s v2, v2, v1
 ; CHECK-GI-NEXT:    shl.4s v3, v3, #31
-; CHECK-GI-NEXT:    shl.4s v2, v2, #31
 ; CHECK-GI-NEXT:    neg.4s v5, v1
+; CHECK-GI-NEXT:    shl.4s v2, v2, #31
 ; CHECK-GI-NEXT:    sshr.4s v3, v3, #31
 ; CHECK-GI-NEXT:    sshr.4s v2, v2, #31
 ; CHECK-GI-NEXT:    bit.16b v0, v4, v3
@@ -500,10 +498,10 @@ define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) {
 ;
 ; CHECK-GI-LABEL: uabdl4s_rdx_i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi.2d v2, #0000000000000000
 ; CHECK-GI-NEXT:    usubl.4s v0, v0, v1
-; CHECK-GI-NEXT:    cmgt.4s v1, v2, v0
+; CHECK-GI-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-GI-NEXT:    neg.4s v2, v0
+; CHECK-GI-NEXT:    cmgt.4s v1, v1, v0
 ; CHECK-GI-NEXT:    bit.16b v0, v2, v1
 ; CHECK-GI-NEXT:    addv.4s s0, v0
 ; CHECK-GI-NEXT:    fmov w0, s0
@@ -569,10 +567,10 @@ define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) {
 ;
 ; CHECK-GI-LABEL: uabdl2d_rdx_i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi.2d v2, #0000000000000000
 ; CHECK-GI-NEXT:    usubl.2d v0, v0, v1
-; CHECK-GI-NEXT:    cmgt.2d v1, v2, v0
+; CHECK-GI-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-GI-NEXT:    neg.2d v2, v0
+; CHECK-GI-NEXT:    cmgt.2d v1, v1, v0
 ; CHECK-GI-NEXT:    bit.16b v0, v2, v1
 ; CHECK-GI-NEXT:    addp.2d d0, v0
 ; CHECK-GI-NEXT:    fmov x0, d0
@@ -1088,21 +1086,13 @@ declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
 declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
 
 define <8 x i16> @sabal8h(ptr %A, ptr %B,  ptr %C) nounwind {
-; CHECK-SD-LABEL: sabal8h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    sabal.8h v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: sabal8h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    sabal.8h v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: sabal8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    sabal.8h v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = load <8 x i16>, ptr %C
@@ -1113,21 +1103,13 @@ define <8 x i16> @sabal8h(ptr %A, ptr %B,  ptr %C) nounwind {
 }
 
 define <4 x i32> @sabal4s(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: sabal4s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    sabal.4s v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: sabal4s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    sabal.4s v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: sabal4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    sabal.4s v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = load <4 x i32>, ptr %C
@@ -1138,21 +1120,13 @@ define <4 x i32> @sabal4s(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <2 x i64> @sabal2d(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: sabal2d:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    sabal.2d v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: sabal2d:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    sabal.2d v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: sabal2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    sabal.2d v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = load <2 x i64>, ptr %C
@@ -1251,21 +1225,13 @@ define <2 x i64> @sabal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <8 x i16> @uabal8h(ptr %A, ptr %B,  ptr %C) nounwind {
-; CHECK-SD-LABEL: uabal8h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    uabal.8h v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uabal8h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    uabal.8h v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uabal8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    uabal.8h v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = load <8 x i16>, ptr %C
@@ -1276,21 +1242,13 @@ define <8 x i16> @uabal8h(ptr %A, ptr %B,  ptr %C) nounwind {
 }
 
 define <4 x i32> @uabal4s(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: uabal4s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    uabal.4s v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uabal4s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    uabal.4s v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uabal4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    uabal.4s v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = load <4 x i32>, ptr %C
@@ -1301,21 +1259,13 @@ define <4 x i32> @uabal4s(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <2 x i64> @uabal2d(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: uabal2d:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    uabal.2d v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uabal2d:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    uabal.2d v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uabal2d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    uabal.2d v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = load <2 x i64>, ptr %C
@@ -1413,21 +1363,13 @@ define <2 x i64> @uabal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <8 x i8> @saba_8b(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: saba_8b:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr d0, [x2]
-; CHECK-SD-NEXT:    saba.8b v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saba_8b:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr d0, [x2]
-; CHECK-GI-NEXT:    saba.8b v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saba_8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr d0, [x2]
+; CHECK-NEXT:    saba.8b v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -1437,21 +1379,13 @@ define <8 x i8> @saba_8b(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <16 x i8> @saba_16b(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: saba_16b:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q1, [x1]
-; CHECK-SD-NEXT:    ldr q2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    saba.16b v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saba_16b:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    saba.16b v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saba_16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    saba.16b v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = load <16 x i8>, ptr %B
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -1461,21 +1395,13 @@ define <16 x i8> @saba_16b(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <4 x i16> @saba_4h(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: saba_4h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr d0, [x2]
-; CHECK-SD-NEXT:    saba.4h v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saba_4h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr d0, [x2]
-; CHECK-GI-NEXT:    saba.4h v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saba_4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr d0, [x2]
+; CHECK-NEXT:    saba.4h v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -1485,21 +1411,13 @@ define <4 x i16> @saba_4h(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <8 x i16> @saba_8h(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: saba_8h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q1, [x1]
-; CHECK-SD-NEXT:    ldr q2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    saba.8h v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saba_8h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    saba.8h v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saba_8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    saba.8h v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i16>, ptr %B
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -1509,21 +1427,13 @@ define <8 x i16> @saba_8h(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <2 x i32> @saba_2s(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: saba_2s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr d0, [x2]
-; CHECK-SD-NEXT:    saba.2s v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saba_2s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr d0, [x2]
-; CHECK-GI-NEXT:    saba.2s v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saba_2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr d0, [x2]
+; CHECK-NEXT:    saba.2s v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -1533,21 +1443,13 @@ define <2 x i32> @saba_2s(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <4 x i32> @saba_4s(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: saba_4s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q1, [x1]
-; CHECK-SD-NEXT:    ldr q2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    saba.4s v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: saba_4s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    saba.4s v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: saba_4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    saba.4s v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i32>, ptr %B
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -1557,21 +1459,13 @@ define <4 x i32> @saba_4s(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <8 x i8> @uaba_8b(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: uaba_8b:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr d0, [x2]
-; CHECK-SD-NEXT:    uaba.8b v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaba_8b:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr d0, [x2]
-; CHECK-GI-NEXT:    uaba.8b v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaba_8b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr d0, [x2]
+; CHECK-NEXT:    uaba.8b v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -1581,21 +1475,13 @@ define <8 x i8> @uaba_8b(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <16 x i8> @uaba_16b(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: uaba_16b:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q1, [x1]
-; CHECK-SD-NEXT:    ldr q2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    uaba.16b v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaba_16b:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    uaba.16b v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaba_16b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    uaba.16b v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <16 x i8>, ptr %A
   %tmp2 = load <16 x i8>, ptr %B
   %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -1605,21 +1491,13 @@ define <16 x i8> @uaba_16b(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <4 x i16> @uaba_4h(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: uaba_4h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr d0, [x2]
-; CHECK-SD-NEXT:    uaba.4h v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaba_4h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr d0, [x2]
-; CHECK-GI-NEXT:    uaba.4h v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaba_4h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr d0, [x2]
+; CHECK-NEXT:    uaba.4h v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -1629,21 +1507,13 @@ define <4 x i16> @uaba_4h(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <8 x i16> @uaba_8h(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: uaba_8h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q1, [x1]
-; CHECK-SD-NEXT:    ldr q2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    uaba.8h v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaba_8h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    uaba.8h v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaba_8h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    uaba.8h v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i16>, ptr %B
   %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -1653,21 +1523,13 @@ define <8 x i16> @uaba_8h(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <2 x i32> @uaba_2s(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: uaba_2s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr d1, [x1]
-; CHECK-SD-NEXT:    ldr d2, [x0]
-; CHECK-SD-NEXT:    ldr d0, [x2]
-; CHECK-SD-NEXT:    uaba.2s v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaba_2s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr d1, [x0]
-; CHECK-GI-NEXT:    ldr d2, [x1]
-; CHECK-GI-NEXT:    ldr d0, [x2]
-; CHECK-GI-NEXT:    uaba.2s v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaba_2s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr d0, [x2]
+; CHECK-NEXT:    uaba.2s v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -1677,21 +1539,13 @@ define <2 x i32> @uaba_2s(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <4 x i32> @uaba_4s(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-SD-LABEL: uaba_4s:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr q1, [x1]
-; CHECK-SD-NEXT:    ldr q2, [x0]
-; CHECK-SD-NEXT:    ldr q0, [x2]
-; CHECK-SD-NEXT:    uaba.4s v0, v2, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: uaba_4s:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ldr q2, [x1]
-; CHECK-GI-NEXT:    ldr q0, [x2]
-; CHECK-GI-NEXT:    uaba.4s v0, v1, v2
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: uaba_4s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    uaba.4s v0, v1, v2
+; CHECK-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i32>, ptr %B
   %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -1957,10 +1811,10 @@ define <2 x i64> @uabd_i32(<2 x i32> %a, <2 x i32> %b) {
 ;
 ; CHECK-GI-LABEL: uabd_i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi.2d v2, #0000000000000000
 ; CHECK-GI-NEXT:    ssubl.2d v0, v0, v1
-; CHECK-GI-NEXT:    cmgt.2d v1, v2, v0
+; CHECK-GI-NEXT:    movi.2d v1, #0000000000000000
 ; CHECK-GI-NEXT:    neg.2d v2, v0
+; CHECK-GI-NEXT:    cmgt.2d v1, v1, v0
 ; CHECK-GI-NEXT:    bit.16b v0, v2, v1
 ; CHECK-GI-NEXT:    ret
   %aext = sext <2 x i32> %a to <2 x i64>
@@ -1976,28 +1830,28 @@ define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: uabd_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov.d x8, v0[1]
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    mov.d x10, v1[1]
+; CHECK-NEXT:    mov.d x9, v1[1]
+; CHECK-NEXT:    fmov x10, d0
 ; CHECK-NEXT:    fmov x11, d1
-; CHECK-NEXT:    asr x12, x9, #63
+; CHECK-NEXT:    asr x12, x10, #63
 ; CHECK-NEXT:    asr x13, x11, #63
-; CHECK-NEXT:    subs x9, x9, x11
-; CHECK-NEXT:    sbc x11, x12, x13
-; CHECK-NEXT:    asr x12, x8, #63
-; CHECK-NEXT:    asr x13, x10, #63
-; CHECK-NEXT:    subs x8, x8, x10
-; CHECK-NEXT:    sbc x10, x12, x13
-; CHECK-NEXT:    asr x12, x11, #63
-; CHECK-NEXT:    asr x13, x10, #63
-; CHECK-NEXT:    eor x9, x9, x12
-; CHECK-NEXT:    eor x8, x8, x13
+; CHECK-NEXT:    subs x10, x10, x11
+; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    asr x14, x9, #63
+; CHECK-NEXT:    sbc x12, x12, x13
+; CHECK-NEXT:    subs x8, x8, x9
+; CHECK-NEXT:    sbc x9, x11, x14
+; CHECK-NEXT:    asr x13, x12, #63
+; CHECK-NEXT:    asr x11, x9, #63
 ; CHECK-NEXT:    eor x10, x10, x13
-; CHECK-NEXT:    subs x2, x8, x13
-; CHECK-NEXT:    sbc x3, x10, x13
-; CHECK-NEXT:    subs x8, x9, x12
-; CHECK-NEXT:    eor x9, x11, x12
-; CHECK-NEXT:    sbc x1, x9, x12
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    eor x8, x8, x11
+; CHECK-NEXT:    eor x9, x9, x11
+; CHECK-NEXT:    subs x2, x8, x11
+; CHECK-NEXT:    eor x8, x12, x13
+; CHECK-NEXT:    sbc x3, x9, x11
+; CHECK-NEXT:    subs x9, x10, x13
+; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    sbc x1, x8, x13
 ; CHECK-NEXT:    mov.d v0[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
index 54a23a03b5e716..3542b26b53539c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
@@ -52,12 +52,12 @@ entry:
 define void @fct1_64x2(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_64x2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:globalArray64x2
-; CHECK-NEXT:    lsl x8, x1, #4
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:globalArray64x2]
-; CHECK-NEXT:    ldr q0, [x0, x8]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:    str q0, [x9, x8]
+; CHECK-NEXT:    adrp x8, :got:globalArray64x2
+; CHECK-NEXT:    lsl x9, x1, #4
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray64x2]
+; CHECK-NEXT:    ldr q0, [x0, x9]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    str q0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <2 x i64>, ptr %array, i64 %offset
@@ -89,12 +89,12 @@ entry:
 define void @fct1_32x4(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_32x4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:globalArray32x4
-; CHECK-NEXT:    lsl x8, x1, #4
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:globalArray32x4]
-; CHECK-NEXT:    ldr q0, [x0, x8]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:    str q0, [x9, x8]
+; CHECK-NEXT:    adrp x8, :got:globalArray32x4
+; CHECK-NEXT:    lsl x9, x1, #4
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray32x4]
+; CHECK-NEXT:    ldr q0, [x0, x9]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    str q0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <4 x i32>, ptr %array, i64 %offset
@@ -126,12 +126,12 @@ entry:
 define void @fct1_16x8(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_16x8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:globalArray16x8
-; CHECK-NEXT:    lsl x8, x1, #4
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:globalArray16x8]
-; CHECK-NEXT:    ldr q0, [x0, x8]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:    str q0, [x9, x8]
+; CHECK-NEXT:    adrp x8, :got:globalArray16x8
+; CHECK-NEXT:    lsl x9, x1, #4
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray16x8]
+; CHECK-NEXT:    ldr q0, [x0, x9]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    str q0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <8 x i16>, ptr %array, i64 %offset
@@ -163,12 +163,12 @@ entry:
 define void @fct1_8x16(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_8x16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:globalArray8x16
-; CHECK-NEXT:    lsl x8, x1, #4
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:globalArray8x16]
-; CHECK-NEXT:    ldr q0, [x0, x8]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:    str q0, [x9, x8]
+; CHECK-NEXT:    adrp x8, :got:globalArray8x16
+; CHECK-NEXT:    lsl x9, x1, #4
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray8x16]
+; CHECK-NEXT:    ldr q0, [x0, x9]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    str q0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <16 x i8>, ptr %array, i64 %offset
@@ -200,12 +200,12 @@ entry:
 define void @fct1_64x1(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_64x1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:globalArray64x1
-; CHECK-NEXT:    lsl x8, x1, #3
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:globalArray64x1]
-; CHECK-NEXT:    ldr d0, [x0, x8]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:    str d0, [x9, x8]
+; CHECK-NEXT:    adrp x8, :got:globalArray64x1
+; CHECK-NEXT:    lsl x9, x1, #3
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray64x1]
+; CHECK-NEXT:    ldr d0, [x0, x9]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    str d0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <1 x i64>, ptr %array, i64 %offset
@@ -237,12 +237,12 @@ entry:
 define void @fct1_32x2(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_32x2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:globalArray32x2
-; CHECK-NEXT:    lsl x8, x1, #3
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:globalArray32x2]
-; CHECK-NEXT:    ldr d0, [x0, x8]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:    str d0, [x9, x8]
+; CHECK-NEXT:    adrp x8, :got:globalArray32x2
+; CHECK-NEXT:    lsl x9, x1, #3
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray32x2]
+; CHECK-NEXT:    ldr d0, [x0, x9]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    str d0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <2 x i32>, ptr %array, i64 %offset
@@ -274,12 +274,12 @@ entry:
 define void @fct1_16x4(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_16x4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:globalArray16x4
-; CHECK-NEXT:    lsl x8, x1, #3
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:globalArray16x4]
-; CHECK-NEXT:    ldr d0, [x0, x8]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:    str d0, [x9, x8]
+; CHECK-NEXT:    adrp x8, :got:globalArray16x4
+; CHECK-NEXT:    lsl x9, x1, #3
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray16x4]
+; CHECK-NEXT:    ldr d0, [x0, x9]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    str d0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <4 x i16>, ptr %array, i64 %offset
@@ -311,12 +311,12 @@ entry:
 define void @fct1_8x8(ptr nocapture %array, i64 %offset) nounwind ssp {
 ; CHECK-LABEL: fct1_8x8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:globalArray8x8
-; CHECK-NEXT:    lsl x8, x1, #3
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:globalArray8x8]
-; CHECK-NEXT:    ldr d0, [x0, x8]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:    str d0, [x9, x8]
+; CHECK-NEXT:    adrp x8, :got:globalArray8x8
+; CHECK-NEXT:    lsl x9, x1, #3
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:globalArray8x8]
+; CHECK-NEXT:    ldr d0, [x0, x9]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    str d0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
   %arrayidx = getelementptr inbounds <8 x i8>, ptr %array, i64 %offset

diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
index c6848d44c3d6ad..e287eff5abb946 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -903,10 +903,10 @@ define <2 x i16> @hadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl.2s v0, v0, #24
 ; CHECK-NEXT:    shl.2s v1, v1, #24
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
 ; CHECK-NEXT:    sshr.2s v0, v0, #24
 ; CHECK-NEXT:    ssra.2s v0, v1, #24
-; CHECK-NEXT:    and.8b v0, v0, v2
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:    ushr.2s v0, v0, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
@@ -968,10 +968,10 @@ define <4 x i16> @rhadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl.4h v0, v0, #8
 ; CHECK-NEXT:    shl.4h v1, v1, #8
-; CHECK-NEXT:    movi.4h v2, #1
 ; CHECK-NEXT:    sshr.4h v0, v0, #8
 ; CHECK-NEXT:    ssra.4h v0, v1, #8
-; CHECK-NEXT:    add.4h v0, v0, v2
+; CHECK-NEXT:    movi.4h v1, #1
+; CHECK-NEXT:    add.4h v0, v0, v1
 ; CHECK-NEXT:    ushr.4h v0, v0, #1
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
@@ -1283,13 +1283,13 @@ define <16 x i8> @andmaskv16i8(<16 x i16> %src1, <16 x i8> %src2) {
 define <16 x i8> @andmask2v16i8(<16 x i16> %src1, <16 x i16> %src2) {
 ; CHECK-LABEL: andmask2v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi.16b v4, #3
-; CHECK-NEXT:    movi.16b v5, #7
 ; CHECK-NEXT:    uzp1.16b v2, v2, v3
+; CHECK-NEXT:    movi.16b v3, #3
 ; CHECK-NEXT:    uzp1.16b v0, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v4
-; CHECK-NEXT:    and.16b v0, v0, v5
-; CHECK-NEXT:    uhadd.16b v0, v0, v1
+; CHECK-NEXT:    movi.16b v1, #7
+; CHECK-NEXT:    and.16b v2, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    uhadd.16b v0, v0, v2
 ; CHECK-NEXT:    ret
   %zextsrc1 = and <16 x i16> %src1, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %zextsrc2 = and <16 x i16> %src2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>

diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
index 1f71d9fdfc4ea3..499786470d4ac1 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -356,10 +356,10 @@ declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
 define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlal4s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    smlal.4s v0, v2, v1
+; CHECK-NEXT:    smlal.4s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -372,10 +372,10 @@ define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlal2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    smlal.2d v0, v2, v1
+; CHECK-NEXT:    smlal.2d v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -406,7 +406,7 @@ define void @smlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <
 define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
 ; CHECK-LABEL: smlal2d_chain_with_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    dup.2d v3, x8
 ; CHECK-NEXT:    smlal.2d v3, v0, v2
 ; CHECK-NEXT:    mvn.8b v0, v2
@@ -425,10 +425,10 @@ define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2,
 define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlsl4s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    smlsl.4s v0, v2, v1
+; CHECK-NEXT:    smlsl.4s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -441,10 +441,10 @@ define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: smlsl2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    smlsl.2d v0, v2, v1
+; CHECK-NEXT:    smlsl.2d v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -475,7 +475,7 @@ define void @smlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <
 define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
 ; CHECK-LABEL: smlsl2d_chain_with_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    dup.2d v3, x8
 ; CHECK-NEXT:    smlsl.2d v3, v0, v2
 ; CHECK-NEXT:    mvn.8b v0, v2
@@ -499,10 +499,10 @@ declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
 define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: sqdmlal4s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    sqdmlal.4s v0, v2, v1
+; CHECK-NEXT:    sqdmlal.4s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -515,10 +515,10 @@ define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: sqdmlal2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    sqdmlal.2d v0, v2, v1
+; CHECK-NEXT:    sqdmlal.2d v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -567,10 +567,10 @@ define <2 x i64> @sqdmlal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: sqdmlsl4s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    sqdmlsl.4s v0, v2, v1
+; CHECK-NEXT:    sqdmlsl.4s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -583,10 +583,10 @@ define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: sqdmlsl2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    sqdmlsl.2d v0, v2, v1
+; CHECK-NEXT:    sqdmlsl.2d v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -635,10 +635,10 @@ define <2 x i64> @sqdmlsl2_2d(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlal4s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    umlal.4s v0, v2, v1
+; CHECK-NEXT:    umlal.4s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -651,10 +651,10 @@ define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @umlal2d(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlal2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    umlal.2d v0, v2, v1
+; CHECK-NEXT:    umlal.2d v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -685,7 +685,7 @@ define void @umlal8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <
 define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
 ; CHECK-LABEL: umlal2d_chain_with_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    dup.2d v3, x8
 ; CHECK-NEXT:    umlal.2d v3, v0, v2
 ; CHECK-NEXT:    mvn.8b v0, v2
@@ -704,10 +704,10 @@ define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2,
 define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlsl4s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    umlsl.4s v0, v2, v1
+; CHECK-NEXT:    umlsl.4s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
@@ -720,10 +720,10 @@ define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: umlsl2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    umlsl.2d v0, v2, v1
+; CHECK-NEXT:    umlsl.2d v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
@@ -754,7 +754,7 @@ define void @umlsl8h_chain_with_constant(ptr %dst, <8 x i8> %v1, <8 x i8> %v2, <
 define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
 ; CHECK-LABEL: umlsl2d_chain_with_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    dup.2d v3, x8
 ; CHECK-NEXT:    umlsl.2d v3, v0, v2
 ; CHECK-NEXT:    mvn.8b v0, v2
@@ -773,10 +773,10 @@ define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2,
 define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: fmla_2s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr d0, [x2]
-; CHECK-NEXT:    fmla.2s v0, v1, v2
+; CHECK-NEXT:    fmla.2s v0, v2, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x float>, ptr %A
   %tmp2 = load <2 x float>, ptr %B
@@ -788,10 +788,10 @@ define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: fmla_4s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    fmla.4s v0, v1, v2
+; CHECK-NEXT:    fmla.4s v0, v2, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x float>, ptr %A
   %tmp2 = load <4 x float>, ptr %B
@@ -803,10 +803,10 @@ define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x double> @fmla_2d(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: fmla_2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    fmla.2d v0, v1, v2
+; CHECK-NEXT:    fmla.2d v0, v2, v1
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x double>, ptr %A
   %tmp2 = load <2 x double>, ptr %B
@@ -822,10 +822,10 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) n
 define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: fmls_2s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr d0, [x2]
-; CHECK-NEXT:    fmls.2s v0, v2, v1
+; CHECK-NEXT:    fmls.2s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x float>, ptr %A
   %tmp2 = load <2 x float>, ptr %B
@@ -838,10 +838,10 @@ define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: fmls_4s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    fmls.4s v0, v2, v1
+; CHECK-NEXT:    fmls.4s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x float>, ptr %A
   %tmp2 = load <4 x float>, ptr %B
@@ -854,10 +854,10 @@ define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: fmls_2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    fmls.2d v0, v2, v1
+; CHECK-NEXT:    fmls.2d v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x double>, ptr %A
   %tmp2 = load <2 x double>, ptr %B
@@ -870,10 +870,10 @@ define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: fmls_commuted_neg_2s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
 ; CHECK-NEXT:    ldr d0, [x2]
-; CHECK-NEXT:    fmls.2s v0, v2, v1
+; CHECK-NEXT:    fmls.2s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x float>, ptr %A
   %tmp2 = load <2 x float>, ptr %B
@@ -886,10 +886,10 @@ define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind {
 define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: fmls_commuted_neg_4s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    fmls.4s v0, v2, v1
+; CHECK-NEXT:    fmls.4s v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x float>, ptr %A
   %tmp2 = load <4 x float>, ptr %B
@@ -902,10 +902,10 @@ define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind {
 define <2 x double> @fmls_commuted_neg_2d(ptr %A, ptr %B, ptr %C) nounwind {
 ; CHECK-LABEL: fmls_commuted_neg_2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q0, [x2]
-; CHECK-NEXT:    fmls.2d v0, v2, v1
+; CHECK-NEXT:    fmls.2d v0, v1, v2
 ; CHECK-NEXT:    ret
   %tmp1 = load <2 x double>, ptr %A
   %tmp2 = load <2 x double>, ptr %B
@@ -1122,13 +1122,13 @@ define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
 define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK-LABEL: mul_2d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    fmov x11, d0
 ; CHECK-NEXT:    mov.d x8, v1[1]
-; CHECK-NEXT:    mov.d x11, v0[1]
-; CHECK-NEXT:    mul x9, x10, x9
-; CHECK-NEXT:    mul x8, x11, x8
-; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    mov.d x9, v0[1]
+; CHECK-NEXT:    mul x10, x11, x10
+; CHECK-NEXT:    mul x8, x9, x8
+; CHECK-NEXT:    fmov d0, x10
 ; CHECK-NEXT:    mov.d v0[1], x8
 ; CHECK-NEXT:    ret
   %tmp1 = mul <2 x i64> %A, %B
@@ -1533,10 +1533,10 @@ define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
 ; CHECK-LABEL: sqadd_lane1_sqdmull4s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sqdmull.4s v0, v0, v1
-; CHECK-NEXT:    fmov s1, w0
 ; CHECK-NEXT:    mov.s w8, v0[1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    sqadd s0, s1, s0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    sqadd s0, s0, s1
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
@@ -1549,10 +1549,10 @@ define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
 ; CHECK-LABEL: sqsub_lane1_sqdmull4s:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sqdmull.4s v0, v0, v1
-; CHECK-NEXT:    fmov s1, w0
 ; CHECK-NEXT:    mov.s w8, v0[1]
-; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    sqsub s0, s1, s0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    sqsub s0, s0, s1
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %B, <4 x i16> %C)
@@ -1564,11 +1564,11 @@ define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind {
 define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
 ; CHECK-LABEL: sqdmlal_lane_1d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov d2, x0
+; CHECK-NEXT:    fmov d1, x0
+; CHECK-NEXT:    fmov s2, w1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqdmlal.s d2, s1, v0[1]
-; CHECK-NEXT:    fmov x0, d2
+; CHECK-NEXT:    sqdmlal.s d1, s2, v0[1]
+; CHECK-NEXT:    fmov x0, d1
 ; CHECK-NEXT:    ret
   %rhs = extractelement <2 x i32> %C, i32 1
   %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
@@ -1581,11 +1581,11 @@ declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
 define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
 ; CHECK-LABEL: sqdmlsl_lane_1d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s1, w1
-; CHECK-NEXT:    fmov d2, x0
+; CHECK-NEXT:    fmov d1, x0
+; CHECK-NEXT:    fmov s2, w1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqdmlsl.s d2, s1, v0[1]
-; CHECK-NEXT:    fmov x0, d2
+; CHECK-NEXT:    sqdmlsl.s d1, s2, v0[1]
+; CHECK-NEXT:    fmov x0, d1
 ; CHECK-NEXT:    ret
   %rhs = extractelement <2 x i32> %C, i32 1
   %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
@@ -2767,10 +2767,10 @@ define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind
 define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind {
 ; CHECK-LABEL: sqdmlal_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s0, w1
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    fmov s2, w2
-; CHECK-NEXT:    sqdmlal.h s2, h1, v0[0]
+; CHECK-NEXT:    sqdmlal.h s2, h0, v1[0]
 ; CHECK-NEXT:    fmov w0, s2
 ; CHECK-NEXT:    ret
   %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0
@@ -2798,10 +2798,10 @@ define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
 define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind {
 ; CHECK-LABEL: sqdmlsl_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s0, w1
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w1
 ; CHECK-NEXT:    fmov s2, w2
-; CHECK-NEXT:    sqdmlsl.h s2, h1, v0[0]
+; CHECK-NEXT:    sqdmlsl.h s2, h0, v1[0]
 ; CHECK-NEXT:    fmov w0, s2
 ; CHECK-NEXT:    ret
   %tmp1 = insertelement <4 x i16> undef, i16 %A, i64 0

diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
index 50fae337031688..4fe1fe8a0ebc98 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
@@ -396,8 +396,8 @@ define i64 @srshl_scalar_constant(ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    srshl d0, d0, d1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -492,8 +492,8 @@ define i64 @urshl_scalar_constant(ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    urshl d0, d0, d1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -805,8 +805,8 @@ define i64 @sqrshl_scalar_constant(ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    sqrshl d0, d0, d1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -914,8 +914,8 @@ define i64 @uqrshl_scalar_constant(ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    uqrshl d0, d0, d1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -2378,8 +2378,8 @@ define i64 @neon.sshll_scalar_constant_shift_m1(ptr %A) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    sshl d0, d0, d1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
index 118aca76abec55..77c70668b65a01 100644
--- a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -18,8 +18,8 @@ define zeroext i1 @saddo1.i32(i32 %v1, i32 %v2, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    adds w8, w0, w1
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str w8, [x2]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo1.i32:
@@ -49,8 +49,8 @@ define zeroext i1 @saddo2.i32(i32 %v1, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    adds w8, w0, #4
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str w8, [x1]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo2.i32:
@@ -80,8 +80,8 @@ define zeroext i1 @saddo3.i32(i32 %v1, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    subs w8, w0, #4
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str w8, [x1]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo3.i32:
@@ -102,7 +102,7 @@ entry:
 define zeroext i1 @saddo4.i32(i32 %v1, ptr %res) {
 ; SDAG-LABEL: saddo4.i32:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    mov w8, #16777215
+; SDAG-NEXT:    mov w8, #16777215 // =0xffffff
 ; SDAG-NEXT:    adds w8, w0, w8
 ; SDAG-NEXT:    cset w0, vs
 ; SDAG-NEXT:    str w8, [x1]
@@ -110,16 +110,16 @@ define zeroext i1 @saddo4.i32(i32 %v1, ptr %res) {
 ;
 ; FAST-LABEL: saddo4.i32:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    mov w8, #16777215
+; FAST-NEXT:    mov w8, #16777215 // =0xffffff
 ; FAST-NEXT:    adds w8, w0, w8
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str w8, [x1]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo4.i32:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    mov w8, #16777215
+; GISEL-NEXT:    mov w8, #16777215 // =0xffffff
 ; GISEL-NEXT:    adds w8, w0, w8
 ; GISEL-NEXT:    cset w0, vs
 ; GISEL-NEXT:    str w8, [x1]
@@ -176,8 +176,8 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    adds x8, x0, x1
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str x8, [x2]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo1.i64:
@@ -206,8 +206,8 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    adds x8, x0, #4
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str x8, [x1]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo2.i64:
@@ -236,8 +236,8 @@ define zeroext i1 @saddo3.i64(i64 %v1, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    subs x8, x0, #4
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str x8, [x1]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo3.i64:
@@ -266,8 +266,8 @@ define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    adds w8, w0, w1
 ; FAST-NEXT:    cset w9, hs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str w8, [x2]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: uaddo.i32:
@@ -296,8 +296,8 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    adds x8, x0, x1
 ; FAST-NEXT:    cset w9, hs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str x8, [x2]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: uaddo.i64:
@@ -326,8 +326,8 @@ define zeroext i1 @ssubo1.i32(i32 %v1, i32 %v2, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    subs w8, w0, w1
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str w8, [x2]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: ssubo1.i32:
@@ -356,8 +356,8 @@ define zeroext i1 @ssubo2.i32(i32 %v1, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    adds w8, w0, #4
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str w8, [x1]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: ssubo2.i32:
@@ -386,8 +386,8 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    subs x8, x0, x1
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str x8, [x2]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: ssubo.i64:
@@ -416,8 +416,8 @@ define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    subs w8, w0, w1
 ; FAST-NEXT:    cset w9, lo
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str w8, [x2]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: usubo.i32:
@@ -446,8 +446,8 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    subs x8, x0, x1
 ; FAST-NEXT:    cset w9, lo
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str x8, [x2]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: usubo.i64:
@@ -469,16 +469,16 @@ define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, ptr %res) {
 ; SDAG:       // %bb.0: // %entry
 ; SDAG-NEXT:    smull x8, w0, w1
 ; SDAG-NEXT:    cmp x8, w8, sxtw
-; SDAG-NEXT:    cset w0, ne
 ; SDAG-NEXT:    str w8, [x2]
+; SDAG-NEXT:    cset w0, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: smulo.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    smull x8, w0, w1
 ; FAST-NEXT:    cmp x8, w8, sxtw
-; FAST-NEXT:    cset w9, ne
 ; FAST-NEXT:    str w8, [x2]
+; FAST-NEXT:    cset w9, ne
 ; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
@@ -487,9 +487,9 @@ define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, ptr %res) {
 ; GISEL-NEXT:    smull x8, w0, w1
 ; GISEL-NEXT:    mul w9, w0, w1
 ; GISEL-NEXT:    asr x8, x8, #32
+; GISEL-NEXT:    str w9, [x2]
 ; GISEL-NEXT:    cmp w8, w9, asr #31
 ; GISEL-NEXT:    cset w0, ne
-; GISEL-NEXT:    str w9, [x2]
 ; GISEL-NEXT:    ret
 entry:
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -504,28 +504,28 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; SDAG:       // %bb.0: // %entry
 ; SDAG-NEXT:    mul x8, x0, x1
 ; SDAG-NEXT:    smulh x9, x0, x1
+; SDAG-NEXT:    str x8, [x2]
 ; SDAG-NEXT:    cmp x9, x8, asr #63
 ; SDAG-NEXT:    cset w0, ne
-; SDAG-NEXT:    str x8, [x2]
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: smulo.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    mul x8, x0, x1
 ; FAST-NEXT:    smulh x9, x0, x1
+; FAST-NEXT:    str x8, [x2]
 ; FAST-NEXT:    cmp x9, x8, asr #63
 ; FAST-NEXT:    cset w9, ne
-; FAST-NEXT:    str x8, [x2]
 ; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: smulo.i64:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    mul x8, x0, x1
-; GISEL-NEXT:    smulh x9, x0, x1
-; GISEL-NEXT:    cmp x9, x8, asr #63
+; GISEL-NEXT:    smulh x8, x0, x1
+; GISEL-NEXT:    mul x9, x0, x1
+; GISEL-NEXT:    cmp x8, x9, asr #63
+; GISEL-NEXT:    str x9, [x2]
 ; GISEL-NEXT:    cset w0, ne
-; GISEL-NEXT:    str x8, [x2]
 ; GISEL-NEXT:    ret
 entry:
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
@@ -547,8 +547,8 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    adds x8, x0, x0
 ; FAST-NEXT:    cset w9, vs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str x8, [x1]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: smulo2.i64:
@@ -570,17 +570,17 @@ define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, ptr %res) {
 ; SDAG:       // %bb.0: // %entry
 ; SDAG-NEXT:    umull x8, w0, w1
 ; SDAG-NEXT:    tst x8, #0xffffffff00000000
-; SDAG-NEXT:    cset w0, ne
 ; SDAG-NEXT:    str w8, [x2]
+; SDAG-NEXT:    cset w0, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: umulo.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    umull x8, w0, w1
 ; FAST-NEXT:    tst x8, #0xffffffff00000000
+; FAST-NEXT:    str w8, [x2]
 ; FAST-NEXT:    cset w9, ne
 ; FAST-NEXT:    and w0, w9, #0x1
-; FAST-NEXT:    str w8, [x2]
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: umulo.i32:
@@ -588,10 +588,9 @@ define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, ptr %res) {
 ; GISEL-NEXT:    umull x8, w0, w1
 ; GISEL-NEXT:    mul w9, w0, w1
 ; GISEL-NEXT:    lsr x8, x8, #32
-; GISEL-NEXT:    cmp w8, #0
-; GISEL-NEXT:    cset w8, ne
-; GISEL-NEXT:    mov w0, w8
 ; GISEL-NEXT:    str w9, [x2]
+; GISEL-NEXT:    cmp w8, #0
+; GISEL-NEXT:    cset w0, ne
 ; GISEL-NEXT:    ret
 entry:
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -607,8 +606,7 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; SDAG-NEXT:    umulh x8, x0, x1
 ; SDAG-NEXT:    mul x9, x0, x1
 ; SDAG-NEXT:    cmp xzr, x8
-; SDAG-NEXT:    cset w8, ne
-; SDAG-NEXT:    mov w0, w8
+; SDAG-NEXT:    cset w0, ne
 ; SDAG-NEXT:    str x9, [x2]
 ; SDAG-NEXT:    ret
 ;
@@ -618,8 +616,7 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; FAST-NEXT:    mul x9, x0, x1
 ; FAST-NEXT:    cmp xzr, x8
 ; FAST-NEXT:    cset w8, ne
-; FAST-NEXT:    and w8, w8, #0x1
-; FAST-NEXT:    mov w0, w8
+; FAST-NEXT:    and w0, w8, #0x1
 ; FAST-NEXT:    str x9, [x2]
 ; FAST-NEXT:    ret
 ;
@@ -628,8 +625,7 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; GISEL-NEXT:    umulh x8, x0, x1
 ; GISEL-NEXT:    mul x9, x0, x1
 ; GISEL-NEXT:    cmp x8, #0
-; GISEL-NEXT:    cset w8, ne
-; GISEL-NEXT:    mov w0, w8
+; GISEL-NEXT:    cset w0, ne
 ; GISEL-NEXT:    str x9, [x2]
 ; GISEL-NEXT:    ret
 entry:
@@ -652,8 +648,8 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    adds x8, x0, x0
 ; FAST-NEXT:    cset w9, hs
-; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    str x8, [x1]
+; FAST-NEXT:    and w0, w9, #0x1
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: umulo2.i64:
@@ -1160,13 +1156,29 @@ entry:
 }
 
 define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
-; CHECK-LABEL: smulo.select.i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mul x8, x0, x1
-; CHECK-NEXT:    smulh x9, x0, x1
-; CHECK-NEXT:    cmp x9, x8, asr #63
-; CHECK-NEXT:    csel x0, x0, x1, ne
-; CHECK-NEXT:    ret
+; SDAG-LABEL: smulo.select.i64:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    mul x8, x0, x1
+; SDAG-NEXT:    smulh x9, x0, x1
+; SDAG-NEXT:    cmp x9, x8, asr #63
+; SDAG-NEXT:    csel x0, x0, x1, ne
+; SDAG-NEXT:    ret
+;
+; FAST-LABEL: smulo.select.i64:
+; FAST:       // %bb.0: // %entry
+; FAST-NEXT:    mul x8, x0, x1
+; FAST-NEXT:    smulh x9, x0, x1
+; FAST-NEXT:    cmp x9, x8, asr #63
+; FAST-NEXT:    csel x0, x0, x1, ne
+; FAST-NEXT:    ret
+;
+; GISEL-LABEL: smulo.select.i64:
+; GISEL:       // %bb.0: // %entry
+; GISEL-NEXT:    smulh x8, x0, x1
+; GISEL-NEXT:    mul x9, x0, x1
+; GISEL-NEXT:    cmp x8, x9, asr #63
+; GISEL-NEXT:    csel x0, x0, x1, ne
+; GISEL-NEXT:    ret
 entry:
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -1193,9 +1205,9 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; GISEL-LABEL: smulo.not.i64:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    mul x8, x0, x1
-; GISEL-NEXT:    smulh x9, x0, x1
-; GISEL-NEXT:    cmp x9, x8, asr #63
+; GISEL-NEXT:    smulh x8, x0, x1
+; GISEL-NEXT:    mul x9, x0, x1
+; GISEL-NEXT:    cmp x8, x9, asr #63
 ; GISEL-NEXT:    cset w8, ne
 ; GISEL-NEXT:    eor w0, w8, #0x1
 ; GISEL-NEXT:    ret
@@ -1326,29 +1338,29 @@ entry:
 define i8 @uaddo.selectboth.i8(i8 %a, i8 %b) {
 ; SDAG-LABEL: uaddo.selectboth.i8:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    and w8, w0, #0xff
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    add w8, w8, w1, uxtb
-; SDAG-NEXT:    tst w8, #0x100
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    and w9, w0, #0xff
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    add w9, w9, w1, uxtb
+; SDAG-NEXT:    tst w9, #0x100
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: uaddo.selectboth.i8:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    and w8, w0, #0xff
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    add w8, w8, w1, uxtb
-; FAST-NEXT:    tst w8, #0x100
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    and w9, w0, #0xff
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    add w9, w9, w1, uxtb
+; FAST-NEXT:    tst w9, #0x100
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: uaddo.selectboth.i8:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    and w8, w1, #0xff
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    add w8, w8, w0, uxtb
-; GISEL-NEXT:    cmp w8, w8, uxtb
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    and w9, w1, #0xff
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    add w9, w9, w0, uxtb
+; GISEL-NEXT:    cmp w9, w9, uxtb
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
@@ -1361,29 +1373,29 @@ entry:
 define i8 @saddo.selectboth.i8(i8 %a, i8 %b) {
 ; SDAG-LABEL: saddo.selectboth.i8:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    sxtb w8, w0
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    add w8, w8, w1, sxtb
-; SDAG-NEXT:    cmp w8, w8, sxtb
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    sxtb w9, w0
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    add w9, w9, w1, sxtb
+; SDAG-NEXT:    cmp w9, w9, sxtb
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: saddo.selectboth.i8:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    sxtb w8, w0
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    add w8, w8, w1, sxtb
-; FAST-NEXT:    cmp w8, w8, sxtb
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    sxtb w9, w0
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    add w9, w9, w1, sxtb
+; FAST-NEXT:    cmp w9, w9, sxtb
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo.selectboth.i8:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    sxtb w8, w1
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    add w8, w8, w0, sxtb
-; GISEL-NEXT:    cmp w8, w8, sxtb
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    sxtb w9, w1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    add w9, w9, w0, sxtb
+; GISEL-NEXT:    cmp w9, w9, sxtb
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 %a, i8 %b)
@@ -1396,29 +1408,29 @@ entry:
 define i16 @uaddo.selectboth.i16(i16 %a, i16 %b) {
 ; SDAG-LABEL: uaddo.selectboth.i16:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    and w8, w0, #0xffff
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    add w8, w8, w1, uxth
-; SDAG-NEXT:    tst w8, #0x10000
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    and w9, w0, #0xffff
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    add w9, w9, w1, uxth
+; SDAG-NEXT:    tst w9, #0x10000
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: uaddo.selectboth.i16:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    and w8, w0, #0xffff
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    add w8, w8, w1, uxth
-; FAST-NEXT:    tst w8, #0x10000
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    and w9, w0, #0xffff
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    add w9, w9, w1, uxth
+; FAST-NEXT:    tst w9, #0x10000
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: uaddo.selectboth.i16:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    and w8, w1, #0xffff
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    add w8, w8, w0, uxth
-; GISEL-NEXT:    cmp w8, w8, uxth
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    and w9, w1, #0xffff
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    add w9, w9, w0, uxth
+; GISEL-NEXT:    cmp w9, w9, uxth
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
@@ -1431,29 +1443,29 @@ entry:
 define i16 @saddo.selectboth.i16(i16 %a, i16 %b) {
 ; SDAG-LABEL: saddo.selectboth.i16:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    sxth w8, w0
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    add w8, w8, w1, sxth
-; SDAG-NEXT:    cmp w8, w8, sxth
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    sxth w9, w0
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    add w9, w9, w1, sxth
+; SDAG-NEXT:    cmp w9, w9, sxth
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: saddo.selectboth.i16:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    sxth w8, w0
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    add w8, w8, w1, sxth
-; FAST-NEXT:    cmp w8, w8, sxth
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    sxth w9, w0
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    add w9, w9, w1, sxth
+; FAST-NEXT:    cmp w9, w9, sxth
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo.selectboth.i16:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    sxth w8, w1
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    add w8, w8, w0, sxth
-; GISEL-NEXT:    cmp w8, w8, sxth
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    sxth w9, w1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    add w9, w9, w0, sxth
+; GISEL-NEXT:    cmp w9, w9, sxth
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 %a, i16 %b)
@@ -1466,25 +1478,25 @@ entry:
 define i32 @uaddo.selectboth.i32(i32 %a, i32 %b) {
 ; SDAG-LABEL: uaddo.selectboth.i32:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    adds w8, w0, w1
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    csel w0, w8, w9, hs
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    adds w9, w0, w1
+; SDAG-NEXT:    csel w0, w9, w8, hs
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: uaddo.selectboth.i32:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    adds w8, w0, w1
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    csel w0, w8, w9, hs
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    adds w9, w0, w1
+; FAST-NEXT:    csel w0, w9, w8, hs
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: uaddo.selectboth.i32:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    adds w8, w0, w1
-; GISEL-NEXT:    mov w10, #10
-; GISEL-NEXT:    cset w9, hs
-; GISEL-NEXT:    tst w9, #0x1
-; GISEL-NEXT:    csel w0, w8, w10, ne
+; GISEL-NEXT:    adds w9, w0, w1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    cset w10, hs
+; GISEL-NEXT:    tst w10, #0x1
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
@@ -1497,25 +1509,25 @@ entry:
 define i32 @saddo.selectboth.i32(i32 %a, i32 %b) {
 ; SDAG-LABEL: saddo.selectboth.i32:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    adds w8, w0, w1
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    csel w0, w8, w9, vs
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    adds w9, w0, w1
+; SDAG-NEXT:    csel w0, w9, w8, vs
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: saddo.selectboth.i32:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    adds w8, w0, w1
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    csel w0, w8, w9, vs
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    adds w9, w0, w1
+; FAST-NEXT:    csel w0, w9, w8, vs
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo.selectboth.i32:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    adds w8, w0, w1
-; GISEL-NEXT:    mov w10, #10
-; GISEL-NEXT:    cset w9, vs
-; GISEL-NEXT:    tst w9, #0x1
-; GISEL-NEXT:    csel w0, w8, w10, ne
+; GISEL-NEXT:    adds w9, w0, w1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    cset w10, vs
+; GISEL-NEXT:    tst w10, #0x1
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
@@ -1528,25 +1540,25 @@ entry:
 define i64 @uaddo.selectboth.i64(i64 %a, i64 %b) {
 ; SDAG-LABEL: uaddo.selectboth.i64:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    adds x8, x0, x1
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    csel x0, x8, x9, hs
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    adds x9, x0, x1
+; SDAG-NEXT:    csel x0, x9, x8, hs
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: uaddo.selectboth.i64:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    adds x8, x0, x1
-; FAST-NEXT:    mov x9, #10
-; FAST-NEXT:    csel x0, x8, x9, hs
+; FAST-NEXT:    mov x8, #10 // =0xa
+; FAST-NEXT:    adds x9, x0, x1
+; FAST-NEXT:    csel x0, x9, x8, hs
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: uaddo.selectboth.i64:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    adds x8, x0, x1
-; GISEL-NEXT:    mov w10, #10
-; GISEL-NEXT:    cset w9, hs
-; GISEL-NEXT:    tst w9, #0x1
-; GISEL-NEXT:    csel x0, x8, x10, ne
+; GISEL-NEXT:    adds x9, x0, x1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    cset w10, hs
+; GISEL-NEXT:    tst w10, #0x1
+; GISEL-NEXT:    csel x0, x9, x8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
@@ -1559,25 +1571,25 @@ entry:
 define i64 @saddo.selectboth.i64(i64 %a, i64 %b) {
 ; SDAG-LABEL: saddo.selectboth.i64:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    adds x8, x0, x1
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    csel x0, x8, x9, vs
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    adds x9, x0, x1
+; SDAG-NEXT:    csel x0, x9, x8, vs
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: saddo.selectboth.i64:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    adds x8, x0, x1
-; FAST-NEXT:    mov x9, #10
-; FAST-NEXT:    csel x0, x8, x9, vs
+; FAST-NEXT:    mov x8, #10 // =0xa
+; FAST-NEXT:    adds x9, x0, x1
+; FAST-NEXT:    csel x0, x9, x8, vs
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: saddo.selectboth.i64:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    adds x8, x0, x1
-; GISEL-NEXT:    mov w10, #10
-; GISEL-NEXT:    cset w9, vs
-; GISEL-NEXT:    tst w9, #0x1
-; GISEL-NEXT:    csel x0, x8, x10, ne
+; GISEL-NEXT:    adds x9, x0, x1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    cset w10, vs
+; GISEL-NEXT:    tst w10, #0x1
+; GISEL-NEXT:    csel x0, x9, x8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
@@ -1590,29 +1602,29 @@ entry:
 define i8 @usubo.selectboth.i8(i8 %a, i8 %b) {
 ; SDAG-LABEL: usubo.selectboth.i8:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    and w8, w0, #0xff
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    sub w8, w8, w1, uxtb
-; SDAG-NEXT:    tst w8, #0xffffff00
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    and w9, w0, #0xff
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    sub w9, w9, w1, uxtb
+; SDAG-NEXT:    tst w9, #0xffffff00
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: usubo.selectboth.i8:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    and w8, w0, #0xff
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    sub w8, w8, w1, uxtb
-; FAST-NEXT:    tst w8, #0xffffff00
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    and w9, w0, #0xff
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    sub w9, w9, w1, uxtb
+; FAST-NEXT:    tst w9, #0xffffff00
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: usubo.selectboth.i8:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    and w8, w0, #0xff
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    sub w8, w8, w1, uxtb
-; GISEL-NEXT:    cmp w8, w8, uxtb
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    and w9, w0, #0xff
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    sub w9, w9, w1, uxtb
+; GISEL-NEXT:    cmp w9, w9, uxtb
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 %a, i8 %b)
@@ -1625,11 +1637,11 @@ entry:
 define i8 @ssubo.selectboth.i8(i8 %a, i8 %b) {
 ; CHECK-LABEL: ssubo.selectboth.i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    mov w9, #10
-; CHECK-NEXT:    sub w8, w8, w1, sxtb
-; CHECK-NEXT:    cmp w8, w8, sxtb
-; CHECK-NEXT:    csel w0, w8, w9, ne
+; CHECK-NEXT:    sxtb w9, w0
+; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    sub w9, w9, w1, sxtb
+; CHECK-NEXT:    cmp w9, w9, sxtb
+; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
 entry:
   %m = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 %a, i8 %b)
@@ -1642,29 +1654,29 @@ entry:
 define i16 @usubo.selectboth.i16(i16 %a, i16 %b) {
 ; SDAG-LABEL: usubo.selectboth.i16:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    and w8, w0, #0xffff
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    sub w8, w8, w1, uxth
-; SDAG-NEXT:    tst w8, #0xffff0000
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    and w9, w0, #0xffff
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    sub w9, w9, w1, uxth
+; SDAG-NEXT:    tst w9, #0xffff0000
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: usubo.selectboth.i16:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    and w8, w0, #0xffff
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    sub w8, w8, w1, uxth
-; FAST-NEXT:    tst w8, #0xffff0000
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    and w9, w0, #0xffff
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    sub w9, w9, w1, uxth
+; FAST-NEXT:    tst w9, #0xffff0000
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: usubo.selectboth.i16:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    and w8, w0, #0xffff
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    sub w8, w8, w1, uxth
-; GISEL-NEXT:    cmp w8, w8, uxth
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    and w9, w0, #0xffff
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    sub w9, w9, w1, uxth
+; GISEL-NEXT:    cmp w9, w9, uxth
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
@@ -1677,11 +1689,11 @@ entry:
 define i16 @ssubo.selectboth.i16(i16 %a, i16 %b) {
 ; CHECK-LABEL: ssubo.selectboth.i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    mov w9, #10
-; CHECK-NEXT:    sub w8, w8, w1, sxth
-; CHECK-NEXT:    cmp w8, w8, sxth
-; CHECK-NEXT:    csel w0, w8, w9, ne
+; CHECK-NEXT:    sxth w9, w0
+; CHECK-NEXT:    mov w8, #10 // =0xa
+; CHECK-NEXT:    sub w9, w9, w1, sxth
+; CHECK-NEXT:    cmp w9, w9, sxth
+; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
 entry:
   %m = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 %a, i16 %b)
@@ -1694,25 +1706,25 @@ entry:
 define i32 @usubo.selectboth.i32(i32 %a, i32 %b) {
 ; SDAG-LABEL: usubo.selectboth.i32:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    subs w8, w0, w1
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    csel w0, w8, w9, lo
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    subs w9, w0, w1
+; SDAG-NEXT:    csel w0, w9, w8, lo
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: usubo.selectboth.i32:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    subs w8, w0, w1
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    csel w0, w8, w9, lo
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    subs w9, w0, w1
+; FAST-NEXT:    csel w0, w9, w8, lo
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: usubo.selectboth.i32:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    subs w8, w0, w1
-; GISEL-NEXT:    mov w10, #10
-; GISEL-NEXT:    cset w9, lo
-; GISEL-NEXT:    tst w9, #0x1
-; GISEL-NEXT:    csel w0, w8, w10, ne
+; GISEL-NEXT:    subs w9, w0, w1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    cset w10, lo
+; GISEL-NEXT:    tst w10, #0x1
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
@@ -1725,25 +1737,25 @@ entry:
 define i32 @ssubo.selectboth.i32(i32 %a, i32 %b) {
 ; SDAG-LABEL: ssubo.selectboth.i32:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    subs w8, w0, w1
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    csel w0, w8, w9, vs
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    subs w9, w0, w1
+; SDAG-NEXT:    csel w0, w9, w8, vs
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: ssubo.selectboth.i32:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    subs w8, w0, w1
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    csel w0, w8, w9, vs
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    subs w9, w0, w1
+; FAST-NEXT:    csel w0, w9, w8, vs
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: ssubo.selectboth.i32:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    subs w8, w0, w1
-; GISEL-NEXT:    mov w10, #10
-; GISEL-NEXT:    cset w9, vs
-; GISEL-NEXT:    tst w9, #0x1
-; GISEL-NEXT:    csel w0, w8, w10, ne
+; GISEL-NEXT:    subs w9, w0, w1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    cset w10, vs
+; GISEL-NEXT:    tst w10, #0x1
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
@@ -1756,25 +1768,25 @@ entry:
 define i64 @usubo.selectboth.i64(i64 %a, i64 %b) {
 ; SDAG-LABEL: usubo.selectboth.i64:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    subs x8, x0, x1
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    csel x0, x8, x9, lo
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    subs x9, x0, x1
+; SDAG-NEXT:    csel x0, x9, x8, lo
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: usubo.selectboth.i64:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    subs x8, x0, x1
-; FAST-NEXT:    mov x9, #10
-; FAST-NEXT:    csel x0, x8, x9, lo
+; FAST-NEXT:    mov x8, #10 // =0xa
+; FAST-NEXT:    subs x9, x0, x1
+; FAST-NEXT:    csel x0, x9, x8, lo
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: usubo.selectboth.i64:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    subs x8, x0, x1
-; GISEL-NEXT:    mov w10, #10
-; GISEL-NEXT:    cset w9, lo
-; GISEL-NEXT:    tst w9, #0x1
-; GISEL-NEXT:    csel x0, x8, x10, ne
+; GISEL-NEXT:    subs x9, x0, x1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    cset w10, lo
+; GISEL-NEXT:    tst w10, #0x1
+; GISEL-NEXT:    csel x0, x9, x8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
@@ -1787,25 +1799,25 @@ entry:
 define i64 @ssubo.selectboth.i64(i64 %a, i64 %b) {
 ; SDAG-LABEL: ssubo.selectboth.i64:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    subs x8, x0, x1
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    csel x0, x8, x9, vs
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    subs x9, x0, x1
+; SDAG-NEXT:    csel x0, x9, x8, vs
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: ssubo.selectboth.i64:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    subs x8, x0, x1
-; FAST-NEXT:    mov x9, #10
-; FAST-NEXT:    csel x0, x8, x9, vs
+; FAST-NEXT:    mov x8, #10 // =0xa
+; FAST-NEXT:    subs x9, x0, x1
+; FAST-NEXT:    csel x0, x9, x8, vs
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: ssubo.selectboth.i64:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    subs x8, x0, x1
-; GISEL-NEXT:    mov w10, #10
-; GISEL-NEXT:    cset w9, vs
-; GISEL-NEXT:    tst w9, #0x1
-; GISEL-NEXT:    csel x0, x8, x10, ne
+; GISEL-NEXT:    subs x9, x0, x1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    cset w10, vs
+; GISEL-NEXT:    tst w10, #0x1
+; GISEL-NEXT:    csel x0, x9, x8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
@@ -1819,32 +1831,32 @@ entry:
 define i8 @umulo.selectboth.i8(i8 %a, i8 %b) {
 ; SDAG-LABEL: umulo.selectboth.i8:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    and w8, w1, #0xff
-; SDAG-NEXT:    and w9, w0, #0xff
-; SDAG-NEXT:    mul w8, w9, w8
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    tst w8, #0xff00
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    and w9, w1, #0xff
+; SDAG-NEXT:    and w10, w0, #0xff
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    mul w9, w10, w9
+; SDAG-NEXT:    tst w9, #0xff00
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: umulo.selectboth.i8:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    and w8, w1, #0xff
-; FAST-NEXT:    and w9, w0, #0xff
-; FAST-NEXT:    mul w8, w9, w8
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    tst w8, #0xff00
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    and w9, w1, #0xff
+; FAST-NEXT:    and w10, w0, #0xff
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    mul w9, w10, w9
+; FAST-NEXT:    tst w9, #0xff00
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: umulo.selectboth.i8:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    and w8, w0, #0xff
-; GISEL-NEXT:    and w9, w1, #0xff
-; GISEL-NEXT:    mul w8, w8, w9
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    cmp w8, w8, uxtb
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    and w9, w0, #0xff
+; GISEL-NEXT:    and w10, w1, #0xff
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    mul w9, w9, w10
+; GISEL-NEXT:    cmp w9, w9, uxtb
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
@@ -1857,32 +1869,32 @@ entry:
 define i8 @smulo.selectboth.i8(i8 %a, i8 %b) {
 ; SDAG-LABEL: smulo.selectboth.i8:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    sxtb w8, w1
-; SDAG-NEXT:    sxtb w9, w0
-; SDAG-NEXT:    mul w8, w9, w8
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    cmp w8, w8, sxtb
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    sxtb w9, w1
+; SDAG-NEXT:    sxtb w10, w0
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    mul w9, w10, w9
+; SDAG-NEXT:    cmp w9, w9, sxtb
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: smulo.selectboth.i8:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    sxtb w8, w1
-; FAST-NEXT:    sxtb w9, w0
-; FAST-NEXT:    mul w8, w9, w8
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    cmp w8, w8, sxtb
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    sxtb w9, w1
+; FAST-NEXT:    sxtb w10, w0
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    mul w9, w10, w9
+; FAST-NEXT:    cmp w9, w9, sxtb
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: smulo.selectboth.i8:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    sxtb w8, w0
-; GISEL-NEXT:    sxtb w9, w1
-; GISEL-NEXT:    mul w8, w8, w9
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    cmp w8, w8, sxtb
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    sxtb w9, w0
+; GISEL-NEXT:    sxtb w10, w1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    mul w9, w9, w10
+; GISEL-NEXT:    cmp w9, w9, sxtb
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 %a, i8 %b)
@@ -1895,32 +1907,32 @@ entry:
 define i16 @umulo.selectboth.i16(i16 %a, i16 %b) {
 ; SDAG-LABEL: umulo.selectboth.i16:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    and w8, w1, #0xffff
-; SDAG-NEXT:    and w9, w0, #0xffff
-; SDAG-NEXT:    mul w8, w9, w8
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    tst w8, #0xffff0000
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    and w9, w1, #0xffff
+; SDAG-NEXT:    and w10, w0, #0xffff
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    mul w9, w10, w9
+; SDAG-NEXT:    tst w9, #0xffff0000
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: umulo.selectboth.i16:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    and w8, w1, #0xffff
-; FAST-NEXT:    and w9, w0, #0xffff
-; FAST-NEXT:    mul w8, w9, w8
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    tst w8, #0xffff0000
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    and w9, w1, #0xffff
+; FAST-NEXT:    and w10, w0, #0xffff
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    mul w9, w10, w9
+; FAST-NEXT:    tst w9, #0xffff0000
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: umulo.selectboth.i16:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    and w8, w0, #0xffff
-; GISEL-NEXT:    and w9, w1, #0xffff
-; GISEL-NEXT:    mul w8, w8, w9
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    cmp w8, w8, uxth
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    and w9, w0, #0xffff
+; GISEL-NEXT:    and w10, w1, #0xffff
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    mul w9, w9, w10
+; GISEL-NEXT:    cmp w9, w9, uxth
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 %a, i16 %b)
@@ -1933,32 +1945,32 @@ entry:
 define i16 @smulo.selectboth.i16(i16 %a, i16 %b) {
 ; SDAG-LABEL: smulo.selectboth.i16:
 ; SDAG:       // %bb.0: // %entry
-; SDAG-NEXT:    sxth w8, w1
-; SDAG-NEXT:    sxth w9, w0
-; SDAG-NEXT:    mul w8, w9, w8
-; SDAG-NEXT:    mov w9, #10
-; SDAG-NEXT:    cmp w8, w8, sxth
-; SDAG-NEXT:    csel w0, w8, w9, ne
+; SDAG-NEXT:    sxth w9, w1
+; SDAG-NEXT:    sxth w10, w0
+; SDAG-NEXT:    mov w8, #10 // =0xa
+; SDAG-NEXT:    mul w9, w10, w9
+; SDAG-NEXT:    cmp w9, w9, sxth
+; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: smulo.selectboth.i16:
 ; FAST:       // %bb.0: // %entry
-; FAST-NEXT:    sxth w8, w1
-; FAST-NEXT:    sxth w9, w0
-; FAST-NEXT:    mul w8, w9, w8
-; FAST-NEXT:    mov w9, #10
-; FAST-NEXT:    cmp w8, w8, sxth
-; FAST-NEXT:    csel w0, w8, w9, ne
+; FAST-NEXT:    sxth w9, w1
+; FAST-NEXT:    sxth w10, w0
+; FAST-NEXT:    mov w8, #10 // =0xa
+; FAST-NEXT:    mul w9, w10, w9
+; FAST-NEXT:    cmp w9, w9, sxth
+; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: smulo.selectboth.i16:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    sxth w8, w0
-; GISEL-NEXT:    sxth w9, w1
-; GISEL-NEXT:    mul w8, w8, w9
-; GISEL-NEXT:    mov w9, #10
-; GISEL-NEXT:    cmp w8, w8, sxth
-; GISEL-NEXT:    csel w0, w8, w9, ne
+; GISEL-NEXT:    sxth w9, w0
+; GISEL-NEXT:    sxth w10, w1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    mul w9, w9, w10
+; GISEL-NEXT:    cmp w9, w9, sxth
+; GISEL-NEXT:    csel w0, w9, w8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 %a, i16 %b)
@@ -1972,7 +1984,7 @@ define i32 @umulo.selectboth.i32(i32 %a, i32 %b) {
 ; SDAG-LABEL: umulo.selectboth.i32:
 ; SDAG:       // %bb.0: // %entry
 ; SDAG-NEXT:    umull x9, w0, w1
-; SDAG-NEXT:    mov w8, #10
+; SDAG-NEXT:    mov w8, #10 // =0xa
 ; SDAG-NEXT:    tst x9, #0xffffffff00000000
 ; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
@@ -1980,7 +1992,7 @@ define i32 @umulo.selectboth.i32(i32 %a, i32 %b) {
 ; FAST-LABEL: umulo.selectboth.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    umull x9, w0, w1
-; FAST-NEXT:    mov w8, #10
+; FAST-NEXT:    mov w8, #10 // =0xa
 ; FAST-NEXT:    tst x9, #0xffffffff00000000
 ; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
@@ -1988,7 +2000,7 @@ define i32 @umulo.selectboth.i32(i32 %a, i32 %b) {
 ; GISEL-LABEL: umulo.selectboth.i32:
 ; GISEL:       // %bb.0: // %entry
 ; GISEL-NEXT:    umull x9, w0, w1
-; GISEL-NEXT:    mov w8, #10
+; GISEL-NEXT:    mov w8, #10 // =0xa
 ; GISEL-NEXT:    mul w10, w0, w1
 ; GISEL-NEXT:    lsr x9, x9, #32
 ; GISEL-NEXT:    cmp w9, #0
@@ -2006,7 +2018,7 @@ define i32 @smulo.selectboth.i32(i32 %a, i32 %b) {
 ; SDAG-LABEL: smulo.selectboth.i32:
 ; SDAG:       // %bb.0: // %entry
 ; SDAG-NEXT:    smull x9, w0, w1
-; SDAG-NEXT:    mov w8, #10
+; SDAG-NEXT:    mov w8, #10 // =0xa
 ; SDAG-NEXT:    cmp x9, w9, sxtw
 ; SDAG-NEXT:    csel w0, w9, w8, ne
 ; SDAG-NEXT:    ret
@@ -2014,7 +2026,7 @@ define i32 @smulo.selectboth.i32(i32 %a, i32 %b) {
 ; FAST-LABEL: smulo.selectboth.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    smull x9, w0, w1
-; FAST-NEXT:    mov w8, #10
+; FAST-NEXT:    mov w8, #10 // =0xa
 ; FAST-NEXT:    cmp x9, w9, sxtw
 ; FAST-NEXT:    csel w0, w9, w8, ne
 ; FAST-NEXT:    ret
@@ -2022,7 +2034,7 @@ define i32 @smulo.selectboth.i32(i32 %a, i32 %b) {
 ; GISEL-LABEL: smulo.selectboth.i32:
 ; GISEL:       // %bb.0: // %entry
 ; GISEL-NEXT:    smull x9, w0, w1
-; GISEL-NEXT:    mov w8, #10
+; GISEL-NEXT:    mov w8, #10 // =0xa
 ; GISEL-NEXT:    mul w10, w0, w1
 ; GISEL-NEXT:    asr x9, x9, #32
 ; GISEL-NEXT:    cmp w9, w10, asr #31
@@ -2040,7 +2052,7 @@ define i64 @umulo.selectboth.i64(i64 %a, i64 %b) {
 ; SDAG-LABEL: umulo.selectboth.i64:
 ; SDAG:       // %bb.0: // %entry
 ; SDAG-NEXT:    umulh x9, x0, x1
-; SDAG-NEXT:    mov w8, #10
+; SDAG-NEXT:    mov w8, #10 // =0xa
 ; SDAG-NEXT:    mul x10, x0, x1
 ; SDAG-NEXT:    cmp xzr, x9
 ; SDAG-NEXT:    csel x0, x10, x8, ne
@@ -2049,7 +2061,7 @@ define i64 @umulo.selectboth.i64(i64 %a, i64 %b) {
 ; FAST-LABEL: umulo.selectboth.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    umulh x9, x0, x1
-; FAST-NEXT:    mov x8, #10
+; FAST-NEXT:    mov x8, #10 // =0xa
 ; FAST-NEXT:    mul x10, x0, x1
 ; FAST-NEXT:    cmp xzr, x9
 ; FAST-NEXT:    csel x0, x10, x8, ne
@@ -2058,7 +2070,7 @@ define i64 @umulo.selectboth.i64(i64 %a, i64 %b) {
 ; GISEL-LABEL: umulo.selectboth.i64:
 ; GISEL:       // %bb.0: // %entry
 ; GISEL-NEXT:    umulh x9, x0, x1
-; GISEL-NEXT:    mov w8, #10
+; GISEL-NEXT:    mov w8, #10 // =0xa
 ; GISEL-NEXT:    mul x10, x0, x1
 ; GISEL-NEXT:    cmp x9, #0
 ; GISEL-NEXT:    csel x0, x10, x8, ne
@@ -2075,7 +2087,7 @@ define i64 @smulo.selectboth.i64(i64 %a, i64 %b) {
 ; SDAG-LABEL: smulo.selectboth.i64:
 ; SDAG:       // %bb.0: // %entry
 ; SDAG-NEXT:    mul x9, x0, x1
-; SDAG-NEXT:    mov w8, #10
+; SDAG-NEXT:    mov w8, #10 // =0xa
 ; SDAG-NEXT:    smulh x10, x0, x1
 ; SDAG-NEXT:    cmp x10, x9, asr #63
 ; SDAG-NEXT:    csel x0, x9, x8, ne
@@ -2084,7 +2096,7 @@ define i64 @smulo.selectboth.i64(i64 %a, i64 %b) {
 ; FAST-LABEL: smulo.selectboth.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    mul x9, x0, x1
-; FAST-NEXT:    mov x8, #10
+; FAST-NEXT:    mov x8, #10 // =0xa
 ; FAST-NEXT:    smulh x10, x0, x1
 ; FAST-NEXT:    cmp x10, x9, asr #63
 ; FAST-NEXT:    csel x0, x9, x8, ne
@@ -2092,11 +2104,11 @@ define i64 @smulo.selectboth.i64(i64 %a, i64 %b) {
 ;
 ; GISEL-LABEL: smulo.selectboth.i64:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    mul x9, x0, x1
-; GISEL-NEXT:    mov w8, #10
-; GISEL-NEXT:    smulh x10, x0, x1
-; GISEL-NEXT:    cmp x10, x9, asr #63
-; GISEL-NEXT:    csel x0, x9, x8, ne
+; GISEL-NEXT:    smulh x9, x0, x1
+; GISEL-NEXT:    mov w8, #10 // =0xa
+; GISEL-NEXT:    mul x10, x0, x1
+; GISEL-NEXT:    cmp x9, x10, asr #63
+; GISEL-NEXT:    csel x0, x10, x8, ne
 ; GISEL-NEXT:    ret
 entry:
   %m = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %a, i64 %b)
@@ -2120,9 +2132,9 @@ define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
 ; FAST-LABEL: saddo.br.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmn w0, w1
-; FAST-NEXT:    mov w9, #1
-; FAST-NEXT:    cset w8, vs
-; FAST-NEXT:    bic w8, w9, w8
+; FAST-NEXT:    mov w8, #1 // =0x1
+; FAST-NEXT:    cset w9, vs
+; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
 ; FAST-NEXT:    ret
 ;
@@ -2155,9 +2167,9 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 ; FAST-LABEL: saddo.br.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmn x0, x1
-; FAST-NEXT:    mov w9, #1
-; FAST-NEXT:    cset w8, vs
-; FAST-NEXT:    bic w8, w9, w8
+; FAST-NEXT:    mov w8, #1 // =0x1
+; FAST-NEXT:    cset w9, vs
+; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
 ; FAST-NEXT:    ret
 ;
@@ -2190,9 +2202,9 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
 ; FAST-LABEL: uaddo.br.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmn w0, w1
-; FAST-NEXT:    mov w9, #1
-; FAST-NEXT:    cset w8, hs
-; FAST-NEXT:    bic w8, w9, w8
+; FAST-NEXT:    mov w8, #1 // =0x1
+; FAST-NEXT:    cset w9, hs
+; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
 ; FAST-NEXT:    ret
 ;
@@ -2225,9 +2237,9 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 ; FAST-LABEL: uaddo.br.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmn x0, x1
-; FAST-NEXT:    mov w9, #1
-; FAST-NEXT:    cset w8, hs
-; FAST-NEXT:    bic w8, w9, w8
+; FAST-NEXT:    mov w8, #1 // =0x1
+; FAST-NEXT:    cset w9, hs
+; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
 ; FAST-NEXT:    ret
 ;
@@ -2260,9 +2272,9 @@ define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
 ; FAST-LABEL: ssubo.br.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmp w0, w1
-; FAST-NEXT:    mov w9, #1
-; FAST-NEXT:    cset w8, vs
-; FAST-NEXT:    bic w8, w9, w8
+; FAST-NEXT:    mov w8, #1 // =0x1
+; FAST-NEXT:    cset w9, vs
+; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
 ; FAST-NEXT:    ret
 ;
@@ -2295,9 +2307,9 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 ; FAST-LABEL: ssubo.br.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmp x0, x1
-; FAST-NEXT:    mov w9, #1
-; FAST-NEXT:    cset w8, vs
-; FAST-NEXT:    bic w8, w9, w8
+; FAST-NEXT:    mov w8, #1 // =0x1
+; FAST-NEXT:    cset w9, vs
+; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
 ; FAST-NEXT:    ret
 ;
@@ -2330,9 +2342,9 @@ define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
 ; FAST-LABEL: usubo.br.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmp w0, w1
-; FAST-NEXT:    mov w9, #1
-; FAST-NEXT:    cset w8, lo
-; FAST-NEXT:    bic w8, w9, w8
+; FAST-NEXT:    mov w8, #1 // =0x1
+; FAST-NEXT:    cset w9, lo
+; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
 ; FAST-NEXT:    ret
 ;
@@ -2365,9 +2377,9 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ; FAST-LABEL: usubo.br.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmp x0, x1
-; FAST-NEXT:    mov w9, #1
-; FAST-NEXT:    cset w8, lo
-; FAST-NEXT:    bic w8, w9, w8
+; FAST-NEXT:    mov w8, #1 // =0x1
+; FAST-NEXT:    cset w9, lo
+; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
 ; FAST-NEXT:    ret
 ;
@@ -2401,7 +2413,7 @@ define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
 ; FAST-LABEL: smulo.br.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    smull x9, w0, w1
-; FAST-NEXT:    mov w8, #1
+; FAST-NEXT:    mov w8, #1 // =0x1
 ; FAST-NEXT:    cmp x9, w9, sxtw
 ; FAST-NEXT:    cset w9, ne
 ; FAST-NEXT:    bic w8, w8, w9
@@ -2442,7 +2454,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; FAST-LABEL: smulo.br.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    mul x9, x0, x1
-; FAST-NEXT:    mov w8, #1
+; FAST-NEXT:    mov w8, #1 // =0x1
 ; FAST-NEXT:    smulh x10, x0, x1
 ; FAST-NEXT:    cmp x10, x9, asr #63
 ; FAST-NEXT:    cset w9, ne
@@ -2452,9 +2464,9 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; GISEL-LABEL: smulo.br.i64:
 ; GISEL:       // %bb.0: // %entry
-; GISEL-NEXT:    mul x8, x0, x1
-; GISEL-NEXT:    smulh x9, x0, x1
-; GISEL-NEXT:    cmp x9, x8, asr #63
+; GISEL-NEXT:    smulh x8, x0, x1
+; GISEL-NEXT:    mul x9, x0, x1
+; GISEL-NEXT:    cmp x8, x9, asr #63
 ; GISEL-NEXT:    cset w8, ne
 ; GISEL-NEXT:    eor w0, w8, #0x1
 ; GISEL-NEXT:    ret
@@ -2481,7 +2493,7 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; FAST-LABEL: smulo2.br.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmn x0, x0
-; FAST-NEXT:    mov w8, #1
+; FAST-NEXT:    mov w8, #1 // =0x1
 ; FAST-NEXT:    cset w9, vs
 ; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
@@ -2517,7 +2529,7 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
 ; FAST-LABEL: umulo.br.i32:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    umull x9, w0, w1
-; FAST-NEXT:    mov w8, #1
+; FAST-NEXT:    mov w8, #1 // =0x1
 ; FAST-NEXT:    tst x9, #0xffffffff00000000
 ; FAST-NEXT:    cset w9, ne
 ; FAST-NEXT:    bic w8, w8, w9
@@ -2556,7 +2568,7 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; FAST-LABEL: umulo.br.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    umulh x9, x0, x1
-; FAST-NEXT:    mov w8, #1
+; FAST-NEXT:    mov w8, #1 // =0x1
 ; FAST-NEXT:    cmp xzr, x9
 ; FAST-NEXT:    cset w9, ne
 ; FAST-NEXT:    bic w8, w8, w9
@@ -2593,7 +2605,7 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) {
 ; FAST-LABEL: umulo2.br.i64:
 ; FAST:       // %bb.0: // %entry
 ; FAST-NEXT:    cmn x0, x0
-; FAST-NEXT:    mov w8, #1
+; FAST-NEXT:    mov w8, #1 // =0x1
 ; FAST-NEXT:    cset w9, hs
 ; FAST-NEXT:    bic w8, w8, w9
 ; FAST-NEXT:    and w0, w8, #0x1
@@ -2621,17 +2633,17 @@ continue:
 define i8 @pr60530() {
 ; SDAG-LABEL: pr60530:
 ; SDAG:       // %bb.0:
-; SDAG-NEXT:    mov w0, #-1
+; SDAG-NEXT:    mov w0, #-1 // =0xffffffff
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: pr60530:
 ; FAST:       // %bb.0:
-; FAST-NEXT:    mov w0, #-1
+; FAST-NEXT:    mov w0, #-1 // =0xffffffff
 ; FAST-NEXT:    ret
 ;
 ; GISEL-LABEL: pr60530:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #1
+; GISEL-NEXT:    mov w8, #1 // =0x1
 ; GISEL-NEXT:    sbfx w0, w8, #0, #1
 ; GISEL-NEXT:    ret
   %1 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 0, i8 1)

diff --git a/llvm/test/CodeGen/AArch64/arm64-zip.ll b/llvm/test/CodeGen/AArch64/arm64-zip.ll
index 05847394e0f96e..e22b57c8af44a8 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zip.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zip.ll
@@ -250,8 +250,8 @@ define <8 x i16> @combine_v8i16_undef(<4 x i16> %0, <4 x i16> %1) {
 define <16 x i8> @combine_v8i16_8first(<8 x i8> %0, <8 x i8> %1) {
 ; CHECK-LABEL: combine_v8i16_8first:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1_q2
+; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    fmov d2, d0
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI17_0]
 ; CHECK-NEXT:    tbl.16b v0, { v1, v2 }, v3
@@ -265,8 +265,8 @@ define <16 x i8> @combine_v8i16_8first(<8 x i8> %0, <8 x i8> %1) {
 define <16 x i8> @combine_v8i16_8firstundef(<8 x i8> %0, <8 x i8> %1) {
 ; CHECK-LABEL: combine_v8i16_8firstundef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI18_0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1_q2
+; CHECK-NEXT:    adrp x8, .LCPI18_0
 ; CHECK-NEXT:    fmov d2, d0
 ; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI18_0]
 ; CHECK-NEXT:    tbl.16b v0, { v1, v2 }, v3

diff --git a/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll b/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll
index 7c38144ef3f3e3..fee4fd839554c1 100644
--- a/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll
@@ -42,10 +42,10 @@ define i8 @test_valid_wrap_optimizable1(ptr %base, i32 %offset) {
 define i8 @test_valid_wrap_optimizable2(ptr %base, i32 %offset) {
 ; CHECK-LABEL: test_valid_wrap_optimizable2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #-100
 ; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    ldrb w0, [x9, x8]
+; CHECK-NEXT:    sxtw x8, w1
+; CHECK-NEXT:    mov w9, #-100 ; =0xffffff9c
+; CHECK-NEXT:    ldrb w0, [x8, x9]
 ; CHECK-NEXT:    ret
 
   %newaddr = getelementptr inbounds i8, ptr inttoptr(i32 -100 to ptr), i32 %offset

diff --git a/llvm/test/CodeGen/AArch64/arm64_32.ll b/llvm/test/CodeGen/AArch64/arm64_32.ll
index b050bf538b74f0..d3b30d398ffcc3 100644
--- a/llvm/test/CodeGen/AArch64/arm64_32.ll
+++ b/llvm/test/CodeGen/AArch64/arm64_32.ll
@@ -731,9 +731,8 @@ define ptr @test_gep_nonpow2(ptr %a0, i32 %a1) {
 
 define void @test_memset(i64 %in, i8 %value)  {
 ; CHECK-LABEL: test_memset:
-; CHECK-DAG: and x8, x0, #0xffffffff
 ; CHECK-DAG: lsr x2, x0, #32
-; CHECK-DAG: mov x0, x8
+; CHECK-DAG: and x0, x0, #0xffffffff
 ; CHECK: b _memset
 
   %ptr.i32 = trunc i64 %in to i32

diff --git a/llvm/test/CodeGen/AArch64/arm64ec-reservedregs.ll b/llvm/test/CodeGen/AArch64/arm64ec-reservedregs.ll
index c28f78a2379397..9f1edd93e0bd7b 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-reservedregs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-reservedregs.ll
@@ -18,10 +18,10 @@ define i32 @no_int_regs(i32 %x) nounwind {
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr w0, [sp, #28] // 4-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x27, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x30, x29, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
@@ -41,10 +41,10 @@ define i32 @one_int_reg(i32 %x) nounwind {
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    mov w0, w30
+; CHECK-NEXT:    ldr x27, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    mov w0, w30
 ; CHECK-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x27, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x30, x29, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
@@ -64,10 +64,10 @@ define float @no_float_regs(float %x) nounwind {
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
 entry:
@@ -79,15 +79,15 @@ define float @one_float_reg(float %x) nounwind {
 ; CHECK-LABEL: one_float_reg:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov s15, s0
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov s0, s15
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    fmov s0, s15
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
index 93a2d6cedc3a78..8b9e66c166498a 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll
@@ -551,15 +551,15 @@ define dso_local i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i8:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i8:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -579,15 +579,15 @@ define dso_local i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i16:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i16:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -871,15 +871,15 @@ define dso_local i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i8:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i8:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -899,15 +899,15 @@ define dso_local i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i16:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i16:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7219,15 +7219,15 @@ define dso_local i8 @test_atomic_load_umax_i8_acq_rel(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i8_acq_rel:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i8_acq_rel:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7247,15 +7247,15 @@ define dso_local i16 @test_atomic_load_umax_i16_acq_rel(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i16_acq_rel:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i16_acq_rel:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7379,15 +7379,15 @@ define dso_local i8 @test_atomic_load_umax_i8_acquire(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i8_acquire:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i8_acquire:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7407,15 +7407,15 @@ define dso_local i16 @test_atomic_load_umax_i16_acquire(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i16_acquire:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i16_acquire:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7539,15 +7539,15 @@ define dso_local i8 @test_atomic_load_umax_i8_monotonic(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i8_monotonic:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i8_monotonic:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7567,15 +7567,15 @@ define dso_local i16 @test_atomic_load_umax_i16_monotonic(i16 %offset) nounwind
 ; CHECK-LABEL: test_atomic_load_umax_i16_monotonic:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i16_monotonic:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7699,15 +7699,15 @@ define dso_local i8 @test_atomic_load_umax_i8_release(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i8_release:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i8_release:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7727,15 +7727,15 @@ define dso_local i16 @test_atomic_load_umax_i16_release(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i16_release:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i16_release:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7859,15 +7859,15 @@ define dso_local i8 @test_atomic_load_umax_i8_seq_cst(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i8_seq_cst:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i8_seq_cst:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -7887,15 +7887,15 @@ define dso_local i16 @test_atomic_load_umax_i16_seq_cst(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i16_seq_cst:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umax_i16_seq_cst:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, hi
-; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, hi
+; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8019,15 +8019,15 @@ define dso_local i8 @test_atomic_load_umin_i8_acq_rel(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i8_acq_rel:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i8_acq_rel:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8047,15 +8047,15 @@ define dso_local i16 @test_atomic_load_umin_i16_acq_rel(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i16_acq_rel:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i16_acq_rel:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8179,15 +8179,15 @@ define dso_local i8 @test_atomic_load_umin_i8_acquire(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i8_acquire:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i8_acquire:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8207,15 +8207,15 @@ define dso_local i16 @test_atomic_load_umin_i16_acquire(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i16_acquire:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i16_acquire:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8339,15 +8339,15 @@ define dso_local i8 @test_atomic_load_umin_i8_monotonic(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i8_monotonic:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i8_monotonic:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8367,15 +8367,15 @@ define dso_local i16 @test_atomic_load_umin_i16_monotonic(i16 %offset) nounwind
 ; CHECK-LABEL: test_atomic_load_umin_i16_monotonic:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i16_monotonic:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8499,15 +8499,15 @@ define dso_local i8 @test_atomic_load_umin_i8_release(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i8_release:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i8_release:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8527,15 +8527,15 @@ define dso_local i16 @test_atomic_load_umin_i16_release(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i16_release:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i16_release:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8659,15 +8659,15 @@ define dso_local i8 @test_atomic_load_umin_i8_seq_cst(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i8_seq_cst:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i8_seq_cst:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var8
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var8
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var8
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrb w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stlxrb w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -8687,15 +8687,15 @@ define dso_local i16 @test_atomic_load_umin_i16_seq_cst(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i16_seq_cst:
 ; OUTLINE-ATOMICS-LABEL: test_atomic_load_umin_i16_seq_cst:
 ; OUTLINE-ATOMICS:       // %bb.0:
-; OUTLINE-ATOMICS-NEXT:    and w8, w0, #0xffff
-; OUTLINE-ATOMICS-NEXT:    adrp x9, var16
-; OUTLINE-ATOMICS-NEXT:    add x9, x9, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    adrp x8, var16
+; OUTLINE-ATOMICS-NEXT:    add x8, x8, :lo12:var16
+; OUTLINE-ATOMICS-NEXT:    and w9, w0, #0xffff
 ; OUTLINE-ATOMICS-NEXT:  .LBB[[LOOPSTART:.*]]: // %atomicrmw.start
 ; OUTLINE-ATOMICS-NEXT:    // =>This Inner Loop Header: Depth=1
-; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x9]
-; OUTLINE-ATOMICS-NEXT:    cmp w0, w8
-; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w8, ls
-; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x9]
+; OUTLINE-ATOMICS-NEXT:    ldaxrh w0, [x8]
+; OUTLINE-ATOMICS-NEXT:    cmp w0, w9
+; OUTLINE-ATOMICS-NEXT:    csel w10, w0, w9, ls
+; OUTLINE-ATOMICS-NEXT:    stlxrh w11, w10, [x8]
 ; OUTLINE-ATOMICS-NEXT:    cbnz w11, .LBB[[LOOPSTART]]
 ; OUTLINE-ATOMICS-NEXT:  // %bb.2: // %atomicrmw.end
 ; OUTLINE-ATOMICS-NEXT:    // kill: def $w0 killed $w0 killed $x0

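For reference, a minimal sketch of the IR shape the umin checks above exercise, assuming the usual form of these tests (the @var8 definition here is illustrative); each LL/SC loop is the expansion of a single atomicrmw:

    @var8 = dso_local global i8 0

    define dso_local i8 @test_atomic_load_umin_i8_acquire(i8 %offset) nounwind {
      ; umin read-modify-write on a global; without LSE atomics this
      ; lowers to the ldaxrb/cmp/csel/stxrb loop checked above.
      %old = atomicrmw umin ptr @var8, i8 %offset acquire
      ret i8 %old
    }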
diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
index 762e70a6e78c1a..fdb14606d463b7 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops-msvc.ll
@@ -605,15 +605,15 @@ define dso_local i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
 define dso_local i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    adrp x9, var8
-; CHECK-NEXT:    add x9, x9, :lo12:var8
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    add x8, x8, :lo12:var8
+; CHECK-NEXT:    and w9, w0, #0xff
 ; CHECK-NEXT:  .LBB32_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldxrb w0, [x9]
-; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    csel w10, w0, w8, ls
-; CHECK-NEXT:    stxrb w11, w10, [x9]
+; CHECK-NEXT:    ldxrb w0, [x8]
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w10, w0, w9, ls
+; CHECK-NEXT:    stxrb w11, w10, [x8]
 ; CHECK-NEXT:    cbnz w11, .LBB32_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -625,15 +625,15 @@ define dso_local i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
 define dso_local i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    adrp x9, var16
-; CHECK-NEXT:    add x9, x9, :lo12:var16
+; CHECK-NEXT:    adrp x8, var16
+; CHECK-NEXT:    add x8, x8, :lo12:var16
+; CHECK-NEXT:    and w9, w0, #0xffff
 ; CHECK-NEXT:  .LBB33_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxrh w0, [x9]
-; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    csel w10, w0, w8, ls
-; CHECK-NEXT:    stxrh w11, w10, [x9]
+; CHECK-NEXT:    ldaxrh w0, [x8]
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w10, w0, w9, ls
+; CHECK-NEXT:    stxrh w11, w10, [x8]
 ; CHECK-NEXT:    cbnz w11, .LBB33_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -684,15 +684,15 @@ define dso_local i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
 define dso_local i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    adrp x9, var8
-; CHECK-NEXT:    add x9, x9, :lo12:var8
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    add x8, x8, :lo12:var8
+; CHECK-NEXT:    and w9, w0, #0xff
 ; CHECK-NEXT:  .LBB36_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxrb w0, [x9]
-; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    csel w10, w0, w8, hi
-; CHECK-NEXT:    stlxrb w11, w10, [x9]
+; CHECK-NEXT:    ldaxrb w0, [x8]
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w10, w0, w9, hi
+; CHECK-NEXT:    stlxrb w11, w10, [x8]
 ; CHECK-NEXT:    cbnz w11, .LBB36_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -704,15 +704,15 @@ define dso_local i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
 define dso_local i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    adrp x9, var16
-; CHECK-NEXT:    add x9, x9, :lo12:var16
+; CHECK-NEXT:    adrp x8, var16
+; CHECK-NEXT:    add x8, x8, :lo12:var16
+; CHECK-NEXT:    and w9, w0, #0xffff
 ; CHECK-NEXT:  .LBB37_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldxrh w0, [x9]
-; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    csel w10, w0, w8, hi
-; CHECK-NEXT:    stxrh w11, w10, [x9]
+; CHECK-NEXT:    ldxrh w0, [x8]
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w10, w0, w9, hi
+; CHECK-NEXT:    stxrh w11, w10, [x8]
 ; CHECK-NEXT:    cbnz w11, .LBB37_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0

diff --git a/llvm/test/CodeGen/AArch64/atomic-ops.ll b/llvm/test/CodeGen/AArch64/atomic-ops.ll
index 4b227c881f3857..f198affdf22a88 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops.ll
@@ -822,15 +822,15 @@ define dso_local i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
 define dso_local i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    adrp x9, var8
-; CHECK-NEXT:    add x9, x9, :lo12:var8
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    add x8, x8, :lo12:var8
+; CHECK-NEXT:    and w9, w0, #0xff
 ; CHECK-NEXT:  .LBB32_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldxrb w0, [x9]
-; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    csel w10, w0, w8, ls
-; CHECK-NEXT:    stxrb w11, w10, [x9]
+; CHECK-NEXT:    ldxrb w0, [x8]
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w10, w0, w9, ls
+; CHECK-NEXT:    stxrb w11, w10, [x8]
 ; CHECK-NEXT:    cbnz w11, .LBB32_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -842,15 +842,15 @@ define dso_local i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
 define dso_local i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umin_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    adrp x9, var16
-; CHECK-NEXT:    add x9, x9, :lo12:var16
+; CHECK-NEXT:    adrp x8, var16
+; CHECK-NEXT:    add x8, x8, :lo12:var16
+; CHECK-NEXT:    and w9, w0, #0xffff
 ; CHECK-NEXT:  .LBB33_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxrh w0, [x9]
-; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    csel w10, w0, w8, ls
-; CHECK-NEXT:    stxrh w11, w10, [x9]
+; CHECK-NEXT:    ldaxrh w0, [x8]
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w10, w0, w9, ls
+; CHECK-NEXT:    stxrh w11, w10, [x8]
 ; CHECK-NEXT:    cbnz w11, .LBB33_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -900,15 +900,15 @@ define dso_local i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
 define dso_local i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    adrp x9, var8
-; CHECK-NEXT:    add x9, x9, :lo12:var8
+; CHECK-NEXT:    adrp x8, var8
+; CHECK-NEXT:    add x8, x8, :lo12:var8
+; CHECK-NEXT:    and w9, w0, #0xff
 ; CHECK-NEXT:  .LBB36_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldaxrb w0, [x9]
-; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    csel w10, w0, w8, hi
-; CHECK-NEXT:    stlxrb w11, w10, [x9]
+; CHECK-NEXT:    ldaxrb w0, [x8]
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w10, w0, w9, hi
+; CHECK-NEXT:    stlxrb w11, w10, [x8]
 ; CHECK-NEXT:    cbnz w11, .LBB36_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -920,15 +920,15 @@ define dso_local i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
 define dso_local i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
 ; CHECK-LABEL: test_atomic_load_umax_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    adrp x9, var16
-; CHECK-NEXT:    add x9, x9, :lo12:var16
+; CHECK-NEXT:    adrp x8, var16
+; CHECK-NEXT:    add x8, x8, :lo12:var16
+; CHECK-NEXT:    and w9, w0, #0xffff
 ; CHECK-NEXT:  .LBB37_1: // %atomicrmw.start
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldxrh w0, [x9]
-; CHECK-NEXT:    cmp w0, w8
-; CHECK-NEXT:    csel w10, w0, w8, hi
-; CHECK-NEXT:    stxrh w11, w10, [x9]
+; CHECK-NEXT:    ldxrh w0, [x8]
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    csel w10, w0, w9, hi
+; CHECK-NEXT:    stxrh w11, w10, [x8]
 ; CHECK-NEXT:    cbnz w11, .LBB37_1
 ; CHECK-NEXT:  // %bb.2: // %atomicrmw.end
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0

diff --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
index 5cb0720d1bc665..4846c46e648178 100644
--- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
@@ -22,7 +22,7 @@ define i1 @test_b2(ptr %s1, ptr %s2) {
 ; CHECKS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECKS-NEXT:    .cfi_def_cfa_offset 16
 ; CHECKS-NEXT:    .cfi_offset w30, -16
-; CHECKS-NEXT:    mov w2, #15
+; CHECKS-NEXT:    mov w2, #15 // =0xf
 ; CHECKS-NEXT:    bl bcmp
 ; CHECKS-NEXT:    cmp w0, #0
 ; CHECKS-NEXT:    cset w0, eq
@@ -52,7 +52,7 @@ define i1 @test_b2_align8(ptr align 8 %s1, ptr align 8 %s2) {
 ; CHECKS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECKS-NEXT:    .cfi_def_cfa_offset 16
 ; CHECKS-NEXT:    .cfi_offset w30, -16
-; CHECKS-NEXT:    mov w2, #15
+; CHECKS-NEXT:    mov w2, #15 // =0xf
 ; CHECKS-NEXT:    bl bcmp
 ; CHECKS-NEXT:    cmp w0, #0
 ; CHECKS-NEXT:    cset w0, eq
@@ -67,16 +67,16 @@ entry:
 define i1 @test_bs(ptr %s1, ptr %s2) optsize {
 ; CHECKN-LABEL: test_bs:
 ; CHECKN:       // %bb.0: // %entry
-; CHECKN-NEXT:    ldp x8, x9, [x0]
-; CHECKN-NEXT:    ldp x10, x11, [x1]
+; CHECKN-NEXT:    ldp x8, x11, [x1]
 ; CHECKN-NEXT:    ldr x12, [x0, #16]
-; CHECKN-NEXT:    cmp x8, x10
-; CHECKN-NEXT:    ldr x8, [x1, #16]
-; CHECKN-NEXT:    ccmp x9, x11, #0, eq
-; CHECKN-NEXT:    ldur x9, [x0, #23]
-; CHECKN-NEXT:    ldur x10, [x1, #23]
-; CHECKN-NEXT:    ccmp x12, x8, #0, eq
-; CHECKN-NEXT:    ccmp x9, x10, #0, eq
+; CHECKN-NEXT:    ldp x9, x10, [x0]
+; CHECKN-NEXT:    ldr x13, [x1, #16]
+; CHECKN-NEXT:    cmp x9, x8
+; CHECKN-NEXT:    ldur x8, [x0, #23]
+; CHECKN-NEXT:    ldur x9, [x1, #23]
+; CHECKN-NEXT:    ccmp x10, x11, #0, eq
+; CHECKN-NEXT:    ccmp x12, x13, #0, eq
+; CHECKN-NEXT:    ccmp x8, x9, #0, eq
 ; CHECKN-NEXT:    cset w0, eq
 ; CHECKN-NEXT:    ret
 ;
@@ -85,7 +85,7 @@ define i1 @test_bs(ptr %s1, ptr %s2) optsize {
 ; CHECKS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECKS-NEXT:    .cfi_def_cfa_offset 16
 ; CHECKS-NEXT:    .cfi_offset w30, -16
-; CHECKS-NEXT:    mov w2, #31
+; CHECKS-NEXT:    mov w2, #31 // =0x1f
 ; CHECKS-NEXT:    bl memcmp
 ; CHECKS-NEXT:    cmp w0, #0
 ; CHECKS-NEXT:    cset w0, eq

diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
index 7b8c3f81a0e3af..fee52ead989629 100644
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp.ll
@@ -6,7 +6,7 @@ declare i32 @bcmp(ptr, ptr, i64)
 define i1 @bcmp0(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 0)
   %r = icmp eq i32 %cr, 0
@@ -249,10 +249,10 @@ define i1 @bcmp15(ptr %a, ptr %b) {
 define i1 @bcmp16(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ldp x8, x11, [x1]
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
@@ -263,13 +263,13 @@ define i1 @bcmp16(ptr %a, ptr %b) {
 define i1 @bcmp20(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp20:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
+; CHECK-NEXT:    ldp x8, x11, [x1]
 ; CHECK-NEXT:    ldr w12, [x0, #16]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ldr w8, [x1, #16]
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x8, #0, eq
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    ldr w13, [x1, #16]
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    ccmp x12, x13, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 20)
@@ -280,13 +280,13 @@ define i1 @bcmp20(ptr %a, ptr %b) {
 define i1 @bcmp24(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp24:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
+; CHECK-NEXT:    ldp x8, x11, [x1]
 ; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ldr x8, [x1, #16]
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x8, #0, eq
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    ldr x13, [x1, #16]
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    ccmp x12, x13, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 24)
@@ -297,16 +297,16 @@ define i1 @bcmp24(ptr %a, ptr %b) {
 define i1 @bcmp28(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp28:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
+; CHECK-NEXT:    ldp x8, x11, [x1]
 ; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ldr x8, [x1, #16]
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
-; CHECK-NEXT:    ldr w9, [x0, #24]
-; CHECK-NEXT:    ldr w10, [x1, #24]
-; CHECK-NEXT:    ccmp x12, x8, #0, eq
-; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    ldr x13, [x1, #16]
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    ldr w8, [x0, #24]
+; CHECK-NEXT:    ldr w9, [x1, #24]
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    ccmp x12, x13, #0, eq
+; CHECK-NEXT:    ccmp x8, x9, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 28)
@@ -317,17 +317,17 @@ define i1 @bcmp28(ptr %a, ptr %b) {
 define i1 @bcmp33(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp33:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
-; CHECK-NEXT:    ldrb w11, [x1, #32]
+; CHECK-NEXT:    ldp x8, x11, [x1]
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    ldp x12, x13, [x1, #16]
+; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ldp x12, x10, [x1, #16]
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    ldrb w10, [x0, #32]
+; CHECK-NEXT:    ldrb w11, [x1, #32]
 ; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldrb w8, [x0, #32]
-; CHECK-NEXT:    ccmp x9, x10, #0, eq
-; CHECK-NEXT:    ccmp x8, x11, #0, eq
+; CHECK-NEXT:    ccmp x9, x13, #0, eq
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 33)
@@ -338,17 +338,17 @@ define i1 @bcmp33(ptr %a, ptr %b) {
 define i1 @bcmp38(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp38:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
-; CHECK-NEXT:    ldur x11, [x1, #30]
+; CHECK-NEXT:    ldp x8, x11, [x1]
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    ldp x12, x13, [x1, #16]
+; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ldp x12, x10, [x1, #16]
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    ldur x10, [x0, #30]
+; CHECK-NEXT:    ldur x11, [x1, #30]
 ; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldur x8, [x0, #30]
-; CHECK-NEXT:    ccmp x9, x10, #0, eq
-; CHECK-NEXT:    ccmp x8, x11, #0, eq
+; CHECK-NEXT:    ccmp x9, x13, #0, eq
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 38)
@@ -359,20 +359,20 @@ define i1 @bcmp38(ptr %a, ptr %b) {
 define i1 @bcmp45(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp45:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
-; CHECK-NEXT:    ldr x11, [x1, #32]
+; CHECK-NEXT:    ldp x8, x11, [x1]
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    ldp x12, x13, [x1, #16]
+; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ldp x12, x10, [x1, #16]
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    ldr x10, [x0, #32]
+; CHECK-NEXT:    ldr x11, [x1, #32]
+; CHECK-NEXT:    ccmp x8, x12, #0, eq
+; CHECK-NEXT:    ldur x8, [x0, #37]
+; CHECK-NEXT:    ldur x12, [x1, #37]
+; CHECK-NEXT:    ccmp x9, x13, #0, eq
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ccmp x9, x10, #0, eq
-; CHECK-NEXT:    ldur x9, [x0, #37]
-; CHECK-NEXT:    ldur x10, [x1, #37]
-; CHECK-NEXT:    ccmp x8, x11, #0, eq
-; CHECK-NEXT:    ccmp x9, x10, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 45)
@@ -389,22 +389,22 @@ define i1 @bcmp45(ptr %a, ptr %b) {
 define i1 @bcmp64(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ldp x8, x11, [x1]
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    ldp x12, x13, [x1, #16]
+; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ldp x12, x10, [x1, #16]
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x10, #0, eq
-; CHECK-NEXT:    ldp x9, x10, [x0, #32]
-; CHECK-NEXT:    ccmp x9, x8, #0, eq
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ldp x9, x10, [x0, #48]
-; CHECK-NEXT:    ldp x8, x11, [x1, #48]
-; CHECK-NEXT:    ccmp x9, x8, #0, eq
 ; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    ccmp x8, x12, #0, eq
+; CHECK-NEXT:    ldp x8, x11, [x0, #32]
+; CHECK-NEXT:    ldp x10, x12, [x1, #32]
+; CHECK-NEXT:    ccmp x9, x13, #0, eq
+; CHECK-NEXT:    ldp x9, x13, [x1, #48]
+; CHECK-NEXT:    ccmp x8, x10, #0, eq
+; CHECK-NEXT:    ldp x8, x10, [x0, #48]
+; CHECK-NEXT:    ccmp x11, x12, #0, eq
+; CHECK-NEXT:    ccmp x8, x9, #0, eq
+; CHECK-NEXT:    ccmp x10, x13, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 64)
@@ -418,7 +418,7 @@ define i1 @bcmp89(ptr %a, ptr %b) {
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w2, #89
+; CHECK-NEXT:    mov w2, #89 // =0x59
 ; CHECK-NEXT:    bl bcmp
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    cset w0, eq
@@ -449,14 +449,14 @@ define i1 @bcmp_zext(i32 %0, i32 %1, i8 %2, i8 %3) {
 define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) {
 ; CHECK-LABEL: bcmp_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w1, #0xff
-; CHECK-NEXT:    and w8, w2, #0xff
+; CHECK-NEXT:    and w8, w1, #0xff
+; CHECK-NEXT:    and w9, w2, #0xff
 ; CHECK-NEXT:    and w10, w3, #0xff
-; CHECK-NEXT:    cmp w9, w0, uxtb
-; CHECK-NEXT:    ccmp w10, w8, #0, eq
+; CHECK-NEXT:    cmp w8, w0, uxtb
 ; CHECK-NEXT:    and w8, w4, #0xff
-; CHECK-NEXT:    and w9, w5, #0xff
-; CHECK-NEXT:    ccmp w9, w8, #0, eq
+; CHECK-NEXT:    and w11, w5, #0xff
+; CHECK-NEXT:    ccmp w10, w9, #0, eq
+; CHECK-NEXT:    ccmp w11, w8, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %xor0 = xor i8 %b0, %a0
@@ -471,14 +471,14 @@ define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) {
 define i1 @bcmp_i16(i16 %a0, i16 %b0, i16 %a1, i16 %b1, i16 %a2, i16 %b2) {
 ; CHECK-LABEL: bcmp_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w1, #0xffff
-; CHECK-NEXT:    and w8, w2, #0xffff
+; CHECK-NEXT:    and w8, w1, #0xffff
+; CHECK-NEXT:    and w9, w2, #0xffff
 ; CHECK-NEXT:    and w10, w3, #0xffff
-; CHECK-NEXT:    cmp w9, w0, uxth
-; CHECK-NEXT:    ccmp w10, w8, #0, eq
+; CHECK-NEXT:    cmp w8, w0, uxth
 ; CHECK-NEXT:    and w8, w4, #0xffff
-; CHECK-NEXT:    and w9, w5, #0xffff
-; CHECK-NEXT:    ccmp w9, w8, #0, eq
+; CHECK-NEXT:    and w11, w5, #0xffff
+; CHECK-NEXT:    ccmp w10, w9, #0, eq
+; CHECK-NEXT:    ccmp w11, w8, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %xor0 = xor i16 %b0, %a0
@@ -494,14 +494,14 @@ define i1 @bcmp_i128(i128 %a0, i128 %b0, i128 %a1, i128 %b1, i128 %a2, i128 %b2)
 ; CHECK-LABEL: bcmp_i128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp x2, x0
+; CHECK-NEXT:    ldp x8, x10, [sp]
 ; CHECK-NEXT:    ccmp x3, x1, #0, eq
-; CHECK-NEXT:    ldp x9, x8, [sp]
+; CHECK-NEXT:    ldp x9, x11, [sp, #16]
 ; CHECK-NEXT:    ccmp x6, x4, #0, eq
-; CHECK-NEXT:    ldp x10, x11, [sp, #16]
 ; CHECK-NEXT:    ccmp x7, x5, #0, eq
 ; CHECK-NEXT:    cset w12, ne
-; CHECK-NEXT:    cmp x10, x9
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    ccmp x11, x10, #0, eq
 ; CHECK-NEXT:    csinc w0, w12, wzr, eq
 ; CHECK-NEXT:    ret
   %xor0 = xor i128 %b0, %a0
@@ -516,14 +516,14 @@ define i1 @bcmp_i128(i128 %a0, i128 %b0, i128 %a1, i128 %b1, i128 %a2, i128 %b2)
 define i1 @bcmp_i42(i42 %a0, i42 %b0, i42 %a1, i42 %b1, i42 %a2, i42 %b2) {
 ; CHECK-LABEL: bcmp_i42:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x9, x0, #0x3ffffffffff
-; CHECK-NEXT:    and x10, x1, #0x3ffffffffff
-; CHECK-NEXT:    and x8, x2, #0x3ffffffffff
+; CHECK-NEXT:    and x8, x0, #0x3ffffffffff
+; CHECK-NEXT:    and x9, x1, #0x3ffffffffff
+; CHECK-NEXT:    and x10, x2, #0x3ffffffffff
 ; CHECK-NEXT:    and x11, x3, #0x3ffffffffff
-; CHECK-NEXT:    cmp x10, x9
-; CHECK-NEXT:    and x9, x5, #0x3ffffffffff
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
+; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    and x8, x4, #0x3ffffffffff
+; CHECK-NEXT:    and x9, x5, #0x3ffffffffff
+; CHECK-NEXT:    ccmp x11, x10, #0, eq
 ; CHECK-NEXT:    ccmp x9, x8, #0, eq
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret

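For reference, the IR for bcmp16 appears verbatim in the hunk above; a self-contained sketch, with the trailing ret assumed from the surrounding tests:

    declare i32 @bcmp(ptr, ptr, i64)

    define i1 @bcmp16(ptr %a, ptr %b) {
      ; A 16-byte bcmp is expanded inline: two 8-byte loads per pointer,
      ; a cmp feeding a ccmp chain, then cset. Only the register
      ; assignment and load order change under the new model.
      %cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
      %r = icmp eq i32 %cr, 0
      ret i1 %r
    }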
diff --git a/llvm/test/CodeGen/AArch64/bf16-shuffle.ll b/llvm/test/CodeGen/AArch64/bf16-shuffle.ll
index 8706cd21aa2c8b..cf0b43807f4c73 100644
--- a/llvm/test/CodeGen/AArch64/bf16-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-shuffle.ll
@@ -226,8 +226,8 @@ define <8 x bfloat> @shuffle3step0_bf16(<32 x bfloat> %src) {
 ; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    mov v3.16b, v2.16b
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    adrp x8, .LCPI16_1
 ; CHECK-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v4.16b
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI16_1]
@@ -244,8 +244,8 @@ define <8 x bfloat> @shuffle3step1_bf16(<32 x bfloat> %src) {
 ; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    mov v3.16b, v2.16b
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    adrp x8, .LCPI17_1
 ; CHECK-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v4.16b
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI17_1]
@@ -262,8 +262,8 @@ define <8 x bfloat> @shuffle3step2_bf16(<32 x bfloat> %src) {
 ; CHECK-NEXT:    adrp x8, .LCPI18_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    mov v3.16b, v2.16b
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    adrp x8, .LCPI18_1
 ; CHECK-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v4.16b
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI18_1]

diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
index d139d12d4af046..b66b149bd643fa 100644
--- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
+++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
@@ -13,26 +13,26 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 define i64 @bfis_in_loop_zero() {
 ; CHECK-LABEL: bfis_in_loop_zero:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:global
-; CHECK-NEXT:    mov x0, xzr
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:global]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:  .LBB0_1: // %midblock
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrh w10, [x9, #72]
-; CHECK-NEXT:    cmp w10, #0
-; CHECK-NEXT:    ubfx x11, x10, #8, #24
-; CHECK-NEXT:    cset w12, ne
-; CHECK-NEXT:    csel w8, w8, w11, eq
-; CHECK-NEXT:    ldr x11, [x9, #8]
-; CHECK-NEXT:    and x9, x10, #0xff
-; CHECK-NEXT:    and x10, x0, #0xffffffff00000000
-; CHECK-NEXT:    orr x9, x9, x8, lsl #8
-; CHECK-NEXT:    orr x10, x10, x12, lsl #16
-; CHECK-NEXT:    orr x0, x10, x9
-; CHECK-NEXT:    ldr x9, [x11, #16]
-; CHECK-NEXT:    cbnz x11, .LBB0_1
+; CHECK-NEXT:    adrp x8, :got:global
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    mov w9, wzr
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:global]
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:  .LBB0_1: // %midblock
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrh w10, [x8, #72]
+; CHECK-NEXT:    ldr x13, [x8, #8]
+; CHECK-NEXT:    ubfx x11, x10, #8, #24
+; CHECK-NEXT:    cmp w10, #0
+; CHECK-NEXT:    and x10, x10, #0xff
+; CHECK-NEXT:    cset w12, ne
+; CHECK-NEXT:    ldr x8, [x13, #16]
+; CHECK-NEXT:    csel w9, w9, w11, eq
+; CHECK-NEXT:    and x11, x0, #0xffffffff00000000
+; CHECK-NEXT:    orr x10, x10, x9, lsl #8
+; CHECK-NEXT:    orr x11, x11, x12, lsl #16
+; CHECK-NEXT:    orr x0, x11, x10
+; CHECK-NEXT:    cbnz x13, .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %exit
 ; CHECK-NEXT:    ret
 entry:
@@ -81,26 +81,26 @@ exit:
 define i64 @bfis_in_loop_undef() {
 ; CHECK-LABEL: bfis_in_loop_undef:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:global
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    // implicit-def: $x0
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:global]
-; CHECK-NEXT:    ldr x9, [x9]
-; CHECK-NEXT:  .LBB1_1: // %midblock
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrh w10, [x9, #72]
-; CHECK-NEXT:    cmp w10, #0
-; CHECK-NEXT:    ubfx x11, x10, #8, #24
-; CHECK-NEXT:    cset w12, ne
-; CHECK-NEXT:    csel w8, w8, w11, eq
-; CHECK-NEXT:    ldr x11, [x9, #8]
-; CHECK-NEXT:    and x9, x10, #0xff
-; CHECK-NEXT:    and x10, x0, #0xffffffff00000000
-; CHECK-NEXT:    orr x9, x9, x8, lsl #8
-; CHECK-NEXT:    orr x10, x10, x12, lsl #16
-; CHECK-NEXT:    orr x0, x10, x9
-; CHECK-NEXT:    ldr x9, [x11, #16]
-; CHECK-NEXT:    cbnz x11, .LBB1_1
+; CHECK-NEXT:    adrp x9, :got:global
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    // implicit-def: $x0
+; CHECK-NEXT:    ldr x9, [x9, :got_lo12:global]
+; CHECK-NEXT:    ldr x9, [x9]
+; CHECK-NEXT:  .LBB1_1: // %midblock
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrh w10, [x9, #72]
+; CHECK-NEXT:    ldr x13, [x9, #8]
+; CHECK-NEXT:    ubfx x11, x10, #8, #24
+; CHECK-NEXT:    cmp w10, #0
+; CHECK-NEXT:    and x10, x10, #0xff
+; CHECK-NEXT:    cset w12, ne
+; CHECK-NEXT:    ldr x9, [x13, #16]
+; CHECK-NEXT:    csel w8, w8, w11, eq
+; CHECK-NEXT:    and x11, x0, #0xffffffff00000000
+; CHECK-NEXT:    orr x10, x10, x8, lsl #8
+; CHECK-NEXT:    orr x11, x11, x12, lsl #16
+; CHECK-NEXT:    orr x0, x11, x10
+; CHECK-NEXT:    cbnz x13, .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %exit
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index edd8bcee3b4827..30b5e86c1e6dc8 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -95,11 +95,11 @@ define void @test_whole32_from64(ptr %existing, ptr %new) {
 define void @test_32bit_masked(ptr %existing, ptr %new) {
 ; CHECK-LABEL: test_32bit_masked:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    mov w10, #135 // =0x87
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    and w8, w8, w10
-; CHECK-NEXT:    bfi w8, w9, #3, #4
+; CHECK-NEXT:    ldr w9, [x0]
+; CHECK-NEXT:    mov w8, #135 // =0x87
+; CHECK-NEXT:    ldr w10, [x1]
+; CHECK-NEXT:    and w8, w9, w8
+; CHECK-NEXT:    bfi w8, w10, #3, #4
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %oldval = load volatile i32, ptr %existing
@@ -141,11 +141,11 @@ define void @test_64bit_masked(ptr %existing, ptr %new) {
 define void @test_32bit_complexmask(ptr %existing, ptr %new) {
 ; CHECK-LABEL: test_32bit_complexmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    mov w10, #647 // =0x287
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    and w8, w8, w10
-; CHECK-NEXT:    bfi w8, w9, #3, #4
+; CHECK-NEXT:    ldr w9, [x0]
+; CHECK-NEXT:    mov w8, #647 // =0x287
+; CHECK-NEXT:    ldr w10, [x1]
+; CHECK-NEXT:    and w8, w9, w8
+; CHECK-NEXT:    bfi w8, w10, #3, #4
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %oldval = load volatile i32, ptr %existing
@@ -166,11 +166,11 @@ define void @test_32bit_badmask(ptr %existing, ptr %new) {
 ; CHECK-LABEL: test_32bit_badmask:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    mov w10, #135 // =0x87
 ; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    mov w11, #632 // =0x278
-; CHECK-NEXT:    and w8, w8, w10
-; CHECK-NEXT:    and w9, w11, w9, lsl #3
+; CHECK-NEXT:    mov w10, #632 // =0x278
+; CHECK-NEXT:    mov w11, #135 // =0x87
+; CHECK-NEXT:    and w9, w10, w9, lsl #3
+; CHECK-NEXT:    and w8, w8, w11
 ; CHECK-NEXT:    orr w8, w8, w9
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
@@ -191,13 +191,13 @@ define void @test_32bit_badmask(ptr %existing, ptr %new) {
 define void @test_64bit_badmask(ptr %existing, ptr %new) {
 ; CHECK-LABEL: test_64bit_badmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w8, #135 // =0x87
-; CHECK-NEXT:    ldr x10, [x1]
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    ldr x9, [x1]
+; CHECK-NEXT:    mov w10, #135 // =0x87
 ; CHECK-NEXT:    mov w11, #664 // =0x298
-; CHECK-NEXT:    and x8, x9, x8
-; CHECK-NEXT:    lsl w10, w10, #3
-; CHECK-NEXT:    and x9, x10, x11
+; CHECK-NEXT:    lsl w9, w9, #3
+; CHECK-NEXT:    and x8, x8, x10
+; CHECK-NEXT:    and x9, x9, x11
 ; CHECK-NEXT:    orr x8, x8, x9
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
@@ -544,8 +544,8 @@ define i64 @test8(i64 %a) {
 define i32 @test9(i64 %b, i32 %e) {
 ; CHECK-LABEL: test9:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr w8, w1, #23
 ; CHECK-NEXT:    lsr x0, x0, #12
+; CHECK-NEXT:    lsr w8, w1, #23
 ; CHECK-NEXT:    bfi w0, w8, #23, #9
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/bool-ext-inc.ll b/llvm/test/CodeGen/AArch64/bool-ext-inc.ll
index f8d185bedd2b55..850ce200f5ad9b 100644
--- a/llvm/test/CodeGen/AArch64/bool-ext-inc.ll
+++ b/llvm/test/CodeGen/AArch64/bool-ext-inc.ll
@@ -31,8 +31,8 @@ define <4 x i32> @zextbool_sub_vector(<4 x i32> %c1, <4 x i32> %c2, <4 x i32> %x
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    adrp x8, .LCPI1_0
 ; GISEL-NEXT:    cmeq v0.4s, v0.4s, v1.4s
-; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI1_0]
-; GISEL-NEXT:    and v0.16b, v0.16b, v3.16b
+; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
+; GISEL-NEXT:    and v0.16b, v0.16b, v1.16b
 ; GISEL-NEXT:    sub v0.4s, v2.4s, v0.4s
 ; GISEL-NEXT:    ret
   %c = icmp eq <4 x i32> %c1, %c2
@@ -107,7 +107,7 @@ define i32 @caller_signext_i1() {
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w0, #-1
+; CHECK-NEXT:    mov w0, #-1 // =0xffffffff
 ; CHECK-NEXT:    bl callee_signext_i1
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -117,7 +117,7 @@ define i32 @caller_signext_i1() {
 ; GISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISEL-NEXT:    .cfi_def_cfa_offset 16
 ; GISEL-NEXT:    .cfi_offset w30, -16
-; GISEL-NEXT:    mov w8, #1
+; GISEL-NEXT:    mov w8, #1 // =0x1
 ; GISEL-NEXT:    sbfx w0, w8, #0, #1
 ; GISEL-NEXT:    bl callee_signext_i1
 ; GISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/branch-relax-alignment.ll b/llvm/test/CodeGen/AArch64/branch-relax-alignment.ll
index d409a0b57a3d7a..bafe432bbfbdbe 100644
--- a/llvm/test/CodeGen/AArch64/branch-relax-alignment.ll
+++ b/llvm/test/CodeGen/AArch64/branch-relax-alignment.ll
@@ -7,9 +7,9 @@
 define i32 @invert_bcc_block_align_higher_func(i32 %x, i32 %y) align 4 #0 {
 ; CHECK-LABEL: invert_bcc_block_align_higher_func:
 ; CHECK:       ; %bb.0: ; %common.ret
+; CHECK-NEXT:    mov w8, #9 ; =0x9
 ; CHECK-NEXT:    cmp w0, w1
-; CHECK-NEXT:    mov w8, #9
-; CHECK-NEXT:    mov w9, #42
+; CHECK-NEXT:    mov w9, #42 ; =0x2a
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    csel w8, w9, w8, eq
 ; CHECK-NEXT:    str w8, [x8]

diff --git a/llvm/test/CodeGen/AArch64/branch-relax-bcc.ll b/llvm/test/CodeGen/AArch64/branch-relax-bcc.ll
index fed9734a2251ef..1a901dc40f14c0 100644
--- a/llvm/test/CodeGen/AArch64/branch-relax-bcc.ll
+++ b/llvm/test/CodeGen/AArch64/branch-relax-bcc.ll
@@ -4,9 +4,9 @@
 define i32 @invert_bcc(float %x, float %y) #0 {
 ; CHECK-LABEL: invert_bcc:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    mov w8, #42 ; =0x2a
 ; CHECK-NEXT:    b.pl LBB0_3
 ; CHECK-NEXT:    b LBB0_2
 ; CHECK-NEXT:  LBB0_3:
@@ -15,8 +15,8 @@ define i32 @invert_bcc(float %x, float %y) #0 {
 ; CHECK-NEXT:    str w8, [x8]
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  LBB0_2: ; %bb2
-; CHECK-NEXT:    mov w0, #1
-; CHECK-NEXT:    mov w8, #9
+; CHECK-NEXT:    mov w0, #1 ; =0x1
+; CHECK-NEXT:    mov w8, #9 ; =0x9
 ; CHECK-NEXT:    ; InlineAsm Start
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
@@ -59,7 +59,7 @@ define i32 @block_split(i32 %a, i32 %b) #0 {
 ; CHECK-NEXT:    bl _foo
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT:  LBB1_3: ; %if.end
-; CHECK-NEXT:    mov w0, #7
+; CHECK-NEXT:    mov w0, #7 ; =0x7
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %a, 5

diff --git a/llvm/test/CodeGen/AArch64/build-one-lane.ll b/llvm/test/CodeGen/AArch64/build-one-lane.ll
index 33de60cd3290e7..a517ca4a1bb4bc 100644
--- a/llvm/test/CodeGen/AArch64/build-one-lane.ll
+++ b/llvm/test/CodeGen/AArch64/build-one-lane.ll
@@ -318,14 +318,13 @@ define void @v2f64st(ptr %p, double %s) nounwind {
 define <32 x i8> @test_lanex_32xi8(<32 x i8> %a, i32 %x) {
 ; CHECK-LABEL: test_lanex_32xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    and x9, x0, #0x1f
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov w10, #30
-; CHECK-NEXT:    stp q0, q1, [sp]
-; CHECK-NEXT:    strb w10, [x8, x9]
+; CHECK-NEXT:    stp q0, q1, [sp, #-32]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    and x8, x0, #0x1f
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov w10, #30 // =0x1e
+; CHECK-NEXT:    strb w10, [x9, x8]
 ; CHECK-NEXT:    ldp q0, q1, [sp], #32
 ; CHECK-NEXT:    ret
   %b = insertelement <32 x i8> %a, i8 30, i32 %x

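For reference, a self-contained sketch of the test in the hunk above (the insertelement line is shown there; the ret is assumed from the usual test shape):

    define <32 x i8> @test_lanex_32xi8(<32 x i8> %a, i32 %x) {
      ; A variable-index insert into a 32-byte vector goes via the stack:
      ; spill both q registers, store the scalar at sp + (x & 0x1f), and
      ; reload. The new schedule folds the 32-byte stack adjustment into
      ; a pre-indexed stp instead of a separate sub sp.
      %b = insertelement <32 x i8> %a, i8 30, i32 %x
      ret <32 x i8> %b
    }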
diff --git a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
index 0c24f4c9ea0a5d..3b064b718cd679 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
@@ -10,8 +10,8 @@ define i32 @widget(i64 %arg, <8 x i16> %arg1) {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    bfi x10, x0, #1, #3
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov w0, wzr

diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
index f7f355a9e101d9..5cfa59a3022394 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
@@ -78,8 +78,8 @@ entry:
 define <16 x i8> @test5(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test5:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    ld1r { v1.16b }, [x1]
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
@@ -212,8 +212,8 @@ define <4 x i32> @test12(ptr nocapture noundef readonly %a, ptr nocapture nounde
 ; CHECK-NEXT:    ld1r { v0.2s }, [x0]
 ; CHECK-NEXT:    ldr w8, [x1]
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mov v1.s[0], w8
 ; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v1.s[0], w8
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index 6b820b635f80e3..2f081cf96d8b88 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -37,12 +37,12 @@ define i1 @usubo_ugt_constant_op0_i8(i8 %x, ptr %p) nounwind {
 ; CHECK-LABEL: usubo_ugt_constant_op0_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    mov w9, #42
+; CHECK-NEXT:    mov w9, #42 // =0x2a
 ; CHECK-NEXT:    cmp w8, #42
 ; CHECK-NEXT:    sub w9, w9, w0
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    strb w9, [x1]
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %s = sub i8 42, %x
   %ov = icmp ugt i8 %x, 42
@@ -56,12 +56,12 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) nounwind {
 ; CHECK-LABEL: usubo_ult_constant_op0_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w9, #43
+; CHECK-NEXT:    mov w9, #43 // =0x2b
 ; CHECK-NEXT:    cmp w8, #43
 ; CHECK-NEXT:    sub w9, w9, w0
 ; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    strh w9, [x1]
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %s = sub i16 43, %x
   %ov = icmp ult i16 43, %x
@@ -77,9 +77,9 @@ define i1 @usubo_ult_constant_op1_i16(i16 %x, ptr %p) nounwind {
 ; CHECK-NEXT:    and w8, w0, #0xffff
 ; CHECK-NEXT:    sub w9, w0, #44
 ; CHECK-NEXT:    cmp w8, #44
+; CHECK-NEXT:    strh w9, [x1]
 ; CHECK-NEXT:    cset w8, lo
 ; CHECK-NEXT:    mov w0, w8
-; CHECK-NEXT:    strh w9, [x1]
 ; CHECK-NEXT:    ret
   %s = add i16 %x, -44
   %ov = icmp ult i16 %x, 44
@@ -93,9 +93,9 @@ define i1 @usubo_ugt_constant_op1_i8(i8 %x, ptr %p) nounwind {
 ; CHECK-NEXT:    and w8, w0, #0xff
 ; CHECK-NEXT:    sub w9, w0, #45
 ; CHECK-NEXT:    cmp w8, #45
+; CHECK-NEXT:    strb w9, [x1]
 ; CHECK-NEXT:    cset w8, lo
 ; CHECK-NEXT:    mov w0, w8
-; CHECK-NEXT:    strb w9, [x1]
 ; CHECK-NEXT:    ret
   %ov = icmp ugt i8 45, %x
   %s = add i8 %x, -45
@@ -111,8 +111,8 @@ define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) nounwind {
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    sub w9, w0, #1
 ; CHECK-NEXT:    cset w8, eq
-; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    str w9, [x1]
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %s = add i32 %x, -1
   %ov = icmp eq i32 %x, 0
@@ -162,10 +162,10 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    mov x23, x0
-; CHECK-NEXT:    cset w21, lo
 ; CHECK-NEXT:    mov x20, x2
-; CHECK-NEXT:    mov w0, w21
+; CHECK-NEXT:    cset w21, lo
 ; CHECK-NEXT:    mov x22, x1
+; CHECK-NEXT:    mov w0, w21
 ; CHECK-NEXT:    bl call
 ; CHECK-NEXT:    subs x8, x23, x22
 ; CHECK-NEXT:    b.hs .LBB8_3

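For reference, a sketch of the usubo idiom these checks cover; the sub and icmp lines appear in the hunk above, and the store of the difference is assumed from the strb in the checked asm:

    define i1 @usubo_ugt_constant_op0_i8(i8 %x, ptr %p) nounwind {
      ; Subtracting from a constant and comparing the operand ugt against
      ; the same constant is the unsigned-subtract-overflow idiom; the
      ; result move and the strb are what the new model reorders above.
      %s = sub i8 42, %x
      %ov = icmp ugt i8 %x, 42
      store i8 %s, ptr %p
      ret i1 %ov
    }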
diff --git a/llvm/test/CodeGen/AArch64/cmp-chains.ll b/llvm/test/CodeGen/AArch64/cmp-chains.ll
index 2de06975d8b30b..c4ad84d9fa25ba 100644
--- a/llvm/test/CodeGen/AArch64/cmp-chains.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-chains.ll
@@ -78,8 +78,8 @@ define i32 @cmp_and4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32
 ; GISEL-NEXT:    cmp w4, w5
 ; GISEL-NEXT:    cset w10, ne
 ; GISEL-NEXT:    cmp w6, w7
-; GISEL-NEXT:    cset w11, eq
 ; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    cset w11, eq
 ; GISEL-NEXT:    and w9, w10, w11
 ; GISEL-NEXT:    and w0, w8, w9
 ; GISEL-NEXT:    ret
@@ -168,8 +168,8 @@ define i32 @cmp_or4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32
 ; GISEL-NEXT:    cmp w4, w5
 ; GISEL-NEXT:    cset w10, ne
 ; GISEL-NEXT:    cmp w6, w7
-; GISEL-NEXT:    cset w11, eq
 ; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    cset w11, eq
 ; GISEL-NEXT:    orr w9, w10, w11
 ; GISEL-NEXT:    orr w0, w8, w9
 ; GISEL-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index 6ce10d1b56d994..bf0c69e291c8fb 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -229,9 +229,9 @@ define <4 x i32> @not_sign_4xi32_3(<4 x i32> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
 ; CHECK-NEXT:    adrp x8, .LCPI18_0
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI18_0]
 ; CHECK-NEXT:    cmgt v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    bic v1.16b, v2.16b, v0.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT:    bic v1.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    ret
   %c = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -244,18 +244,18 @@ define <4 x i65> @sign_4xi65(<4 x i65> %a) {
 ; CHECK-LABEL: sign_4xi65:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sbfx x8, x1, #0, #1
-; CHECK-NEXT:    sbfx x10, x5, #0, #1
-; CHECK-NEXT:    orr x9, x8, #0x1
-; CHECK-NEXT:    lsr x1, x8, #63
-; CHECK-NEXT:    sbfx x8, x7, #0, #1
-; CHECK-NEXT:    orr x4, x10, #0x1
-; CHECK-NEXT:    lsr x5, x10, #63
-; CHECK-NEXT:    orr x6, x8, #0x1
-; CHECK-NEXT:    fmov d0, x9
 ; CHECK-NEXT:    sbfx x9, x3, #0, #1
-; CHECK-NEXT:    orr x2, x9, #0x1
+; CHECK-NEXT:    sbfx x10, x7, #0, #1
+; CHECK-NEXT:    lsr x1, x8, #63
+; CHECK-NEXT:    orr x8, x8, #0x1
 ; CHECK-NEXT:    lsr x3, x9, #63
-; CHECK-NEXT:    lsr x7, x8, #63
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    sbfx x8, x5, #0, #1
+; CHECK-NEXT:    lsr x7, x10, #63
+; CHECK-NEXT:    orr x2, x9, #0x1
+; CHECK-NEXT:    orr x6, x10, #0x1
+; CHECK-NEXT:    lsr x5, x8, #63
+; CHECK-NEXT:    orr x4, x8, #0x1
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
index 15fabf37793c12..b9da2b76816a99 100644
--- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
+++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll
@@ -15,7 +15,7 @@ define i32 @test_return(ptr %p, i32 %oldval, i32 %newval) {
 ; CHECK-NEXT:    stlxr w8, w2, [x0]
 ; CHECK-NEXT:    cbnz w8, LBB0_1
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 ; =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  LBB0_4: ; %cmpxchg.nostore
 ; CHECK-NEXT:    mov w0, wzr
@@ -64,7 +64,7 @@ define i1 @test_return_bool(ptr %value, i8 %oldValue, i8 %newValue) {
 ; CHECK-NEXT:    stlxrb w9, w2, [x0]
 ; CHECK-NEXT:    cbnz w9, LBB1_1
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-NEXT:    eor w0, w8, #0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  LBB1_4: ; %cmpxchg.nostore
@@ -87,8 +87,8 @@ define i1 @test_return_bool(ptr %value, i8 %oldValue, i8 %newValue) {
 ; OUTLINE-ATOMICS-NEXT:    mov w1, w2
 ; OUTLINE-ATOMICS-NEXT:    mov x2, x8
 ; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas1_acq_rel
-; OUTLINE-ATOMICS-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
 ; OUTLINE-ATOMICS-NEXT:    cmp w0, w19, uxtb
+; OUTLINE-ATOMICS-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
 ; OUTLINE-ATOMICS-NEXT:    cset w8, eq
 ; OUTLINE-ATOMICS-NEXT:    eor w0, w8, #0x1
 ; OUTLINE-ATOMICS-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
@@ -188,13 +188,13 @@ define i1 @test_conditional2(i32 %a, i32 %b, ptr %c) {
 ; CHECK-NEXT:    stlxr w8, w20, [x19]
 ; CHECK-NEXT:    cbnz w8, LBB3_1
 ; CHECK-NEXT:  ; %bb.3:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-NEXT:    b LBB3_5
 ; CHECK-NEXT:  LBB3_4: ; %cmpxchg.nostore
 ; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    clrex
 ; CHECK-NEXT:  LBB3_5: ; %for.cond.preheader
-; CHECK-NEXT:    mov w22, #2
+; CHECK-NEXT:    mov w22, #2 ; =0x2
 ; CHECK-NEXT:  LBB3_6: ; %for.cond
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cbz w22, LBB3_9
@@ -236,7 +236,7 @@ define i1 @test_conditional2(i32 %a, i32 %b, ptr %c) {
 ; OUTLINE-ATOMICS-NEXT:    mov w21, w0
 ; OUTLINE-ATOMICS-NEXT:    bl ___aarch64_cas4_acq_rel
 ; OUTLINE-ATOMICS-NEXT:    cmp w0, w21
-; OUTLINE-ATOMICS-NEXT:    mov w22, #2
+; OUTLINE-ATOMICS-NEXT:    mov w22, #2 ; =0x2
 ; OUTLINE-ATOMICS-NEXT:    cset w8, eq
 ; OUTLINE-ATOMICS-NEXT:  LBB3_1: ; %for.cond
 ; OUTLINE-ATOMICS-NEXT:    ; =>This Inner Loop Header: Depth=1

diff --git a/llvm/test/CodeGen/AArch64/combine-andintoload.ll b/llvm/test/CodeGen/AArch64/combine-andintoload.ll
index d18dc0772adfc5..f0b8fef848998f 100644
--- a/llvm/test/CodeGen/AArch64/combine-andintoload.ll
+++ b/llvm/test/CodeGen/AArch64/combine-andintoload.ll
@@ -412,13 +412,13 @@ define i64 @load32_and16_sexty(ptr %p, i32 %y) {
 define zeroext i1 @bigger(ptr nocapture readonly %c, ptr nocapture readonly %e, i64 %d, i64 %p1) {
 ; CHECK-LABEL: bigger:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0, x2]
+; CHECK-NEXT:    ldrb w8, [x1, x2]
+; CHECK-NEXT:    ldrb w9, [x0, x2]
 ; CHECK-NEXT:    and w10, w3, #0x7
-; CHECK-NEXT:    ldrb w9, [x1, x2]
-; CHECK-NEXT:    mov w11, #8
+; CHECK-NEXT:    mov w11, #8 // =0x8
 ; CHECK-NEXT:    sub w10, w11, w10
-; CHECK-NEXT:    eor w8, w9, w8
-; CHECK-NEXT:    mov w9, #5
+; CHECK-NEXT:    eor w8, w8, w9
+; CHECK-NEXT:    mov w9, #5 // =0x5
 ; CHECK-NEXT:    lsr w8, w8, w10
 ; CHECK-NEXT:    tst w8, w9
 ; CHECK-NEXT:    cset w0, eq
@@ -426,13 +426,13 @@ define zeroext i1 @bigger(ptr nocapture readonly %c, ptr nocapture readonly %e,
 ;
 ; CHECKBE-LABEL: bigger:
 ; CHECKBE:       // %bb.0: // %entry
-; CHECKBE-NEXT:    ldrb w8, [x0, x2]
+; CHECKBE-NEXT:    ldrb w8, [x1, x2]
+; CHECKBE-NEXT:    ldrb w9, [x0, x2]
 ; CHECKBE-NEXT:    and w10, w3, #0x7
-; CHECKBE-NEXT:    ldrb w9, [x1, x2]
-; CHECKBE-NEXT:    mov w11, #8
+; CHECKBE-NEXT:    mov w11, #8 // =0x8
 ; CHECKBE-NEXT:    sub w10, w11, w10
-; CHECKBE-NEXT:    eor w8, w9, w8
-; CHECKBE-NEXT:    mov w9, #5
+; CHECKBE-NEXT:    eor w8, w8, w9
+; CHECKBE-NEXT:    mov w9, #5 // =0x5
 ; CHECKBE-NEXT:    lsr w8, w8, w10
 ; CHECKBE-NEXT:    tst w8, w9
 ; CHECKBE-NEXT:    cset w0, eq

diff --git a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
index 101e5696c70f98..6449c3e11d6672 100644
--- a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
+++ b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -13,10 +13,10 @@ define i32 @combine_gt_ge_10() #0 {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, :got:a
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:a]
-; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    ldr w8, [x8]
+; CHECK-NEXT:    cmp w8, #10
 ; CHECK-NEXT:    adrp x8, :got:b
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:b]
-; CHECK-NEXT:    cmp w9, #10
 ; CHECK-NEXT:    b.le .LBB0_3
 ; CHECK-NEXT:  // %bb.1: // %land.lhs.true
 ; CHECK-NEXT:    adrp x9, :got:c
@@ -26,7 +26,7 @@ define i32 @combine_gt_ge_10() #0 {
 ; CHECK-NEXT:    cmp w10, w9
 ; CHECK-NEXT:    b.ne .LBB0_4
 ; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_3: // %lor.lhs.false
 ; CHECK-NEXT:    b.lt .LBB0_6
@@ -38,7 +38,7 @@ define i32 @combine_gt_ge_10() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB0_6
 ; CHECK-NEXT:  // %bb.5:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_6: // %if.end
 ; CHECK-NEXT:    mov w0, wzr
@@ -91,7 +91,7 @@ define i32 @combine_gt_lt_5() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB1_6
 ; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_3: // %lor.lhs.false
 ; CHECK-NEXT:    b.ge .LBB1_6
@@ -105,7 +105,7 @@ define i32 @combine_gt_lt_5() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB1_6
 ; CHECK-NEXT:  // %bb.5:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_6: // %if.end
 ; CHECK-NEXT:    mov w0, wzr
@@ -145,10 +145,10 @@ define i32 @combine_lt_ge_5() #0 {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, :got:a
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:a]
-; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    ldr w8, [x8]
+; CHECK-NEXT:    cmp w8, #5
 ; CHECK-NEXT:    adrp x8, :got:b
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:b]
-; CHECK-NEXT:    cmp w9, #5
 ; CHECK-NEXT:    b.ge .LBB2_3
 ; CHECK-NEXT:  // %bb.1: // %land.lhs.true
 ; CHECK-NEXT:    adrp x9, :got:c
@@ -158,7 +158,7 @@ define i32 @combine_lt_ge_5() #0 {
 ; CHECK-NEXT:    cmp w10, w9
 ; CHECK-NEXT:    b.ne .LBB2_4
 ; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_3: // %lor.lhs.false
 ; CHECK-NEXT:    b.gt .LBB2_6
@@ -170,7 +170,7 @@ define i32 @combine_lt_ge_5() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB2_6
 ; CHECK-NEXT:  // %bb.5:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_6: // %if.end
 ; CHECK-NEXT:    mov w0, wzr
@@ -223,7 +223,7 @@ define i32 @combine_lt_gt_5() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB3_6
 ; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB3_3: // %lor.lhs.false
 ; CHECK-NEXT:    b.le .LBB3_6
@@ -237,7 +237,7 @@ define i32 @combine_lt_gt_5() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB3_6
 ; CHECK-NEXT:  // %bb.5:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB3_6: // %if.end
 ; CHECK-NEXT:    mov w0, wzr
@@ -290,7 +290,7 @@ define i32 @combine_gt_lt_n5() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB4_6
 ; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB4_3: // %lor.lhs.false
 ; CHECK-NEXT:    b.ge .LBB4_6
@@ -304,7 +304,7 @@ define i32 @combine_gt_lt_n5() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB4_6
 ; CHECK-NEXT:  // %bb.5:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB4_6: // %if.end
 ; CHECK-NEXT:    mov w0, wzr
@@ -357,7 +357,7 @@ define i32 @combine_lt_gt_n5() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB5_6
 ; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB5_3: // %lor.lhs.false
 ; CHECK-NEXT:    b.le .LBB5_6
@@ -371,7 +371,7 @@ define i32 @combine_lt_gt_n5() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB5_6
 ; CHECK-NEXT:  // %bb.5:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB5_6: // %if.end
 ; CHECK-NEXT:    mov w0, wzr
@@ -425,7 +425,7 @@ define void @combine_non_adjacent_cmp_br(ptr nocapture readonly %hdCall) #0 {
 ; CHECK-NEXT:    .cfi_offset w22, -32
 ; CHECK-NEXT:    .cfi_offset w30, -48
 ; CHECK-NEXT:    ldr x20, [x0]
-; CHECK-NEXT:    mov w19, #24
+; CHECK-NEXT:    mov w19, #24 // =0x18
 ; CHECK-NEXT:    adrp x22, glob
 ; CHECK-NEXT:    add x21, x20, #2
 ; CHECK-NEXT:  .LBB6_1: // %land.rhs
@@ -511,7 +511,7 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB7_7
 ; CHECK-NEXT:  // %bb.6:
-; CHECK-NEXT:    mov w0, #123
+; CHECK-NEXT:    mov w0, #123 // =0x7b
 ; CHECK-NEXT:    b .LBB7_8
 ; CHECK-NEXT:  .LBB7_7: // %if.end
 ; CHECK-NEXT:    mov w0, wzr
@@ -597,7 +597,7 @@ define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB8_6
 ; CHECK-NEXT:  // %bb.5:
-; CHECK-NEXT:    mov w0, #123
+; CHECK-NEXT:    mov w0, #123 // =0x7b
 ; CHECK-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
@@ -680,8 +680,8 @@ define i32 @fcmpri(i32 %argc, ptr nocapture readonly %argv) #0 {
 ; CHECK-NEXT:    bl yoo
 ; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    mov w1, #2 // =0x2
-; CHECK-NEXT:    cinc w0, w19, gt
 ; CHECK-NEXT:    fmov d8, d0
+; CHECK-NEXT:    cinc w0, w19, gt
 ; CHECK-NEXT:    bl xoo
 ; CHECK-NEXT:    fmov d0, #-1.00000000
 ; CHECK-NEXT:    fcmp d8, #0.0
@@ -740,11 +740,11 @@ define void @cmp_shifted(i32 %in, i32 %lhs, i32 %rhs) #0 {
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    mov w9, #128 // =0x80
 ; CHECK-NEXT:    csinc w8, w8, wzr, gt
 ; CHECK-NEXT:    cmp w0, #2, lsl #12 // =8192
-; CHECK-NEXT:    mov w9, #128
 ; CHECK-NEXT:    csel w0, w9, w8, ge
 ; CHECK-NEXT:    bl zoo
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -797,7 +797,7 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB11_4
 ; CHECK-NEXT:  // %bb.3:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB11_4: // %land.lhs.true3
 ; CHECK-NEXT:    adrp x8, :got:b
@@ -809,7 +809,7 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 {
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    b.ne .LBB11_6
 ; CHECK-NEXT:  // %bb.5:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB11_6: // %if.end
 ; CHECK-NEXT:    mov w0, wzr

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll
index 04cce6c7dd6cef..c684a18a7e0773 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll
@@ -16,10 +16,10 @@ define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c)
 ; CHECK-NEXT:    fneg v1.2d, v1.2d
 ; CHECK-NEXT:    fmla v3.2d, v2.2d, v5.2d
 ; CHECK-NEXT:    fmla v1.2d, v2.2d, v0.2d
-; CHECK-NEXT:    fadd v3.2d, v3.2d, v4.2d
 ; CHECK-NEXT:    fadd v1.2d, v2.2d, v1.2d
-; CHECK-NEXT:    zip1 v0.2d, v1.2d, v3.2d
-; CHECK-NEXT:    zip2 v1.2d, v1.2d, v3.2d
+; CHECK-NEXT:    fadd v2.2d, v3.2d, v4.2d
+; CHECK-NEXT:    zip1 v0.2d, v1.2d, v2.2d
+; CHECK-NEXT:    zip2 v1.2d, v1.2d, v2.2d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -49,15 +49,15 @@ define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double>
 ; CHECK-NEXT:    movi v18.2d, #0000000000000000
 ; CHECK-NEXT:    movi v19.2d, #0000000000000000
 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #0
-; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #0
-; CHECK-NEXT:    fcmla v18.2d, v4.2d, v6.2d, #0
-; CHECK-NEXT:    fcmla v19.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v18.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v19.2d, v4.2d, v6.2d, #0
 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #90
-; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #90
-; CHECK-NEXT:    fcmla v18.2d, v4.2d, v6.2d, #90
-; CHECK-NEXT:    fcmla v19.2d, v5.2d, v7.2d, #90
-; CHECK-NEXT:    fadd v0.2d, v16.2d, v18.2d
-; CHECK-NEXT:    fadd v1.2d, v17.2d, v19.2d
+; CHECK-NEXT:    fcmla v18.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT:    fcmla v19.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT:    fadd v1.2d, v18.2d, v17.2d
+; CHECK-NEXT:    fadd v0.2d, v16.2d, v19.2d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -95,15 +95,15 @@ define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double>
 ; CHECK-NEXT:    movi v18.2d, #0000000000000000
 ; CHECK-NEXT:    movi v19.2d, #0000000000000000
 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #0
-; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #0
-; CHECK-NEXT:    fcmla v18.2d, v4.2d, v6.2d, #0
-; CHECK-NEXT:    fcmla v19.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v18.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v19.2d, v4.2d, v6.2d, #0
 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #90
-; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #90
-; CHECK-NEXT:    fcmla v18.2d, v4.2d, v6.2d, #90
-; CHECK-NEXT:    fcmla v19.2d, v5.2d, v7.2d, #90
-; CHECK-NEXT:    fsub v0.2d, v16.2d, v18.2d
-; CHECK-NEXT:    fsub v1.2d, v17.2d, v19.2d
+; CHECK-NEXT:    fcmla v18.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT:    fcmla v19.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT:    fsub v1.2d, v18.2d, v17.2d
+; CHECK-NEXT:    fsub v0.2d, v16.2d, v19.2d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -141,15 +141,15 @@ define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double
 ; CHECK-NEXT:    movi v18.2d, #0000000000000000
 ; CHECK-NEXT:    movi v19.2d, #0000000000000000
 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #0
-; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #0
-; CHECK-NEXT:    fcmla v18.2d, v6.2d, v4.2d, #0
-; CHECK-NEXT:    fcmla v19.2d, v7.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v18.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v7.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v19.2d, v6.2d, v4.2d, #0
 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v2.2d, #90
-; CHECK-NEXT:    fcmla v17.2d, v1.2d, v3.2d, #90
-; CHECK-NEXT:    fcmla v18.2d, v6.2d, v4.2d, #270
-; CHECK-NEXT:    fcmla v19.2d, v7.2d, v5.2d, #270
-; CHECK-NEXT:    fadd v0.2d, v16.2d, v18.2d
-; CHECK-NEXT:    fadd v1.2d, v17.2d, v19.2d
+; CHECK-NEXT:    fcmla v18.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v7.2d, v5.2d, #270
+; CHECK-NEXT:    fcmla v19.2d, v6.2d, v4.2d, #270
+; CHECK-NEXT:    fadd v1.2d, v18.2d, v17.2d
+; CHECK-NEXT:    fadd v0.2d, v16.2d, v19.2d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -185,34 +185,33 @@ define <4 x double> @mul_add_rot_mull(<4 x double> %a, <4 x double> %b, <4 x dou
 ; CHECK-NEXT:    movi v16.2d, #0xffffffffffffffff
 ; CHECK-NEXT:    zip2 v17.2d, v4.2d, v5.2d
 ; CHECK-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-NEXT:    zip1 v19.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v4.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip2 v19.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip2 v20.2d, v2.2d, v3.2d
+; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    zip1 v2.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    fneg v16.2d, v16.2d
-; CHECK-NEXT:    zip2 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    zip1 v1.2d, v4.2d, v5.2d
-; CHECK-NEXT:    zip1 v5.2d, v2.2d, v3.2d
-; CHECK-NEXT:    mov v4.16b, v16.16b
-; CHECK-NEXT:    bsl v4.16b, v18.16b, v17.16b
-; CHECK-NEXT:    zip2 v2.2d, v2.2d, v3.2d
-; CHECK-NEXT:    mov v3.16b, v16.16b
-; CHECK-NEXT:    bsl v3.16b, v18.16b, v1.16b
-; CHECK-NEXT:    fadd v1.2d, v1.2d, v4.2d
-; CHECK-NEXT:    zip2 v4.2d, v6.2d, v7.2d
+; CHECK-NEXT:    fmul v1.2d, v19.2d, v20.2d
+; CHECK-NEXT:    fmul v3.2d, v0.2d, v20.2d
+; CHECK-NEXT:    mov v5.16b, v16.16b
+; CHECK-NEXT:    bsl v16.16b, v18.16b, v4.16b
+; CHECK-NEXT:    fneg v1.2d, v1.2d
+; CHECK-NEXT:    fmla v3.2d, v2.2d, v19.2d
+; CHECK-NEXT:    bsl v5.16b, v18.16b, v17.16b
+; CHECK-NEXT:    fsub v16.2d, v16.2d, v17.2d
+; CHECK-NEXT:    fmla v1.2d, v2.2d, v0.2d
+; CHECK-NEXT:    fadd v4.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip2 v5.2d, v6.2d, v7.2d
 ; CHECK-NEXT:    zip1 v6.2d, v6.2d, v7.2d
-; CHECK-NEXT:    fmul v7.2d, v0.2d, v2.2d
-; CHECK-NEXT:    fsub v3.2d, v3.2d, v17.2d
-; CHECK-NEXT:    fmul v16.2d, v1.2d, v4.2d
-; CHECK-NEXT:    fmul v2.2d, v19.2d, v2.2d
-; CHECK-NEXT:    fneg v7.2d, v7.2d
-; CHECK-NEXT:    fmul v4.2d, v3.2d, v4.2d
-; CHECK-NEXT:    fneg v16.2d, v16.2d
-; CHECK-NEXT:    fmla v2.2d, v5.2d, v0.2d
-; CHECK-NEXT:    fmla v7.2d, v5.2d, v19.2d
-; CHECK-NEXT:    fmla v4.2d, v1.2d, v6.2d
-; CHECK-NEXT:    fmla v16.2d, v6.2d, v3.2d
-; CHECK-NEXT:    fadd v1.2d, v2.2d, v4.2d
-; CHECK-NEXT:    fadd v2.2d, v7.2d, v16.2d
-; CHECK-NEXT:    zip1 v0.2d, v2.2d, v1.2d
-; CHECK-NEXT:    zip2 v1.2d, v2.2d, v1.2d
+; CHECK-NEXT:    fmul v17.2d, v4.2d, v5.2d
+; CHECK-NEXT:    fmul v5.2d, v16.2d, v5.2d
+; CHECK-NEXT:    fneg v7.2d, v17.2d
+; CHECK-NEXT:    fmla v5.2d, v4.2d, v6.2d
+; CHECK-NEXT:    fmla v7.2d, v6.2d, v16.2d
+; CHECK-NEXT:    fadd v2.2d, v3.2d, v5.2d
+; CHECK-NEXT:    fadd v1.2d, v1.2d, v7.2d
+; CHECK-NEXT:    zip1 v0.2d, v1.2d, v2.2d
+; CHECK-NEXT:    zip2 v1.2d, v1.2d, v2.2d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
index a0c19f4589f51c..9b6a9e7adf796f 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
@@ -39,16 +39,16 @@ define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v16.2d, #0000000000000000
 ; CHECK-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v16.2d, v4.2d, v6.2d, #0
-; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #0
-; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #0
-; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #0
-; CHECK-NEXT:    fcmla v16.2d, v4.2d, v6.2d, #90
-; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #90
-; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #90
-; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #90
-; CHECK-NEXT:    mov v0.16b, v16.16b
-; CHECK-NEXT:    mov v1.16b, v17.16b
+; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    fcmla v16.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    mov v1.16b, v16.16b
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -83,16 +83,16 @@ define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v16.2d, #0000000000000000
 ; CHECK-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v16.2d, v4.2d, v6.2d, #270
-; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #270
-; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #0
-; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #0
-; CHECK-NEXT:    fcmla v16.2d, v4.2d, v6.2d, #180
-; CHECK-NEXT:    fcmla v17.2d, v5.2d, v7.2d, #180
-; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #90
-; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #90
-; CHECK-NEXT:    mov v0.16b, v16.16b
-; CHECK-NEXT:    mov v1.16b, v17.16b
+; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #270
+; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #270
+; CHECK-NEXT:    fcmla v17.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v4.2d, v6.2d, #180
+; CHECK-NEXT:    fcmla v16.2d, v5.2d, v7.2d, #180
+; CHECK-NEXT:    fcmla v17.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    fcmla v16.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    mov v1.16b, v16.16b
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -127,16 +127,16 @@ define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v16.2d, #0000000000000000
 ; CHECK-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #0
-; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #0
-; CHECK-NEXT:    fcmla v16.2d, v2.2d, v0.2d, #90
-; CHECK-NEXT:    fcmla v17.2d, v3.2d, v1.2d, #90
-; CHECK-NEXT:    fcmla v16.2d, v6.2d, v4.2d, #0
-; CHECK-NEXT:    fcmla v17.2d, v7.2d, v5.2d, #0
-; CHECK-NEXT:    fcmla v16.2d, v6.2d, v4.2d, #270
-; CHECK-NEXT:    fcmla v17.2d, v7.2d, v5.2d, #270
-; CHECK-NEXT:    mov v0.16b, v16.16b
-; CHECK-NEXT:    mov v1.16b, v17.16b
+; CHECK-NEXT:    fcmla v17.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    fcmla v16.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v6.2d, v4.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v7.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v6.2d, v4.2d, #270
+; CHECK-NEXT:    fcmla v16.2d, v7.2d, v5.2d, #270
+; CHECK-NEXT:    mov v0.16b, v17.16b
+; CHECK-NEXT:    mov v1.16b, v16.16b
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -172,22 +172,22 @@ define <4 x double> @mul_add_rot_mull(<4 x double> %a, <4 x double> %b, <4 x dou
 ; CHECK-NEXT:    zip2 v16.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    zip2 v17.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    zip1 v2.2d, v2.2d, v3.2d
-; CHECK-NEXT:    zip2 v3.2d, v4.2d, v5.2d
-; CHECK-NEXT:    zip1 v18.2d, v6.2d, v7.2d
-; CHECK-NEXT:    fmul v19.2d, v16.2d, v17.2d
+; CHECK-NEXT:    zip2 v18.2d, v4.2d, v5.2d
+; CHECK-NEXT:    zip1 v19.2d, v6.2d, v7.2d
 ; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    zip1 v1.2d, v4.2d, v5.2d
-; CHECK-NEXT:    fmul v4.2d, v2.2d, v17.2d
 ; CHECK-NEXT:    zip2 v5.2d, v6.2d, v7.2d
-; CHECK-NEXT:    fmla v19.2d, v3.2d, v18.2d
+; CHECK-NEXT:    fmul v3.2d, v16.2d, v17.2d
+; CHECK-NEXT:    fmul v4.2d, v2.2d, v17.2d
+; CHECK-NEXT:    fmla v3.2d, v18.2d, v19.2d
 ; CHECK-NEXT:    fmla v4.2d, v0.2d, v16.2d
-; CHECK-NEXT:    fmla v19.2d, v1.2d, v5.2d
-; CHECK-NEXT:    fmla v4.2d, v1.2d, v18.2d
-; CHECK-NEXT:    fneg v1.2d, v19.2d
-; CHECK-NEXT:    fmls v4.2d, v3.2d, v5.2d
-; CHECK-NEXT:    fmla v1.2d, v0.2d, v2.2d
-; CHECK-NEXT:    zip1 v0.2d, v1.2d, v4.2d
-; CHECK-NEXT:    zip2 v1.2d, v1.2d, v4.2d
+; CHECK-NEXT:    fmla v3.2d, v1.2d, v5.2d
+; CHECK-NEXT:    fmla v4.2d, v1.2d, v19.2d
+; CHECK-NEXT:    fneg v3.2d, v3.2d
+; CHECK-NEXT:    fmls v4.2d, v18.2d, v5.2d
+; CHECK-NEXT:    fmla v3.2d, v0.2d, v2.2d
+; CHECK-NEXT:    zip1 v0.2d, v3.2d, v4.2d
+; CHECK-NEXT:    zip2 v1.2d, v3.2d, v4.2d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
index 79ffe693fe311a..ebf5ce20d4ecc4 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll
@@ -7,21 +7,21 @@ target triple = "aarch64-unknown-linux-gnu"
 define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c) {
 ; CHECK-LABEL: mull_add:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp2 z6.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z7.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z6.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z7.d, z2.d, z3.d
+; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
-; CHECK-NEXT:    fmul z2.d, z0.d, z6.d
-; CHECK-NEXT:    fmla z2.d, p0/m, z7.d, z1.d
-; CHECK-NEXT:    fmul z3.d, z7.d, z6.d
-; CHECK-NEXT:    fnmsb z0.d, p0/m, z1.d, z3.d
-; CHECK-NEXT:    uzp2 z1.d, z4.d, z5.d
+; CHECK-NEXT:    fmul z2.d, z6.d, z7.d
+; CHECK-NEXT:    fmul z3.d, z0.d, z7.d
+; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    fnmsb z1.d, p0/m, z6.d, z3.d
+; CHECK-NEXT:    uzp2 z2.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp1 z3.d, z4.d, z5.d
-; CHECK-NEXT:    fadd z3.d, z3.d, z0.d
-; CHECK-NEXT:    fadd z1.d, z2.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z3.d, z1.d
-; CHECK-NEXT:    zip2 z1.d, z3.d, z1.d
+; CHECK-NEXT:    fadd z2.d, z0.d, z2.d
+; CHECK-NEXT:    fadd z1.d, z3.d, z1.d
+; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -49,8 +49,8 @@ entry:
 define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_add_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d
@@ -100,8 +100,8 @@ entry:
 define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_sub_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d
@@ -151,8 +151,8 @@ entry:
 define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_conj_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d
@@ -203,35 +203,37 @@ define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale
 ; CHECK-LABEL: mul_add_rot_mull:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    uzp2 z24.d, z4.d, z5.d
-; CHECK-NEXT:    mov z26.d, #0 // =0x0
-; CHECK-NEXT:    mov z25.d, z24.d
-; CHECK-NEXT:    and z26.d, z26.d, #0x7fffffffffffffff
-; CHECK-NEXT:    and z25.d, z25.d, #0x8000000000000000
-; CHECK-NEXT:    uzp2 z27.d, z0.d, z1.d
+; CHECK-NEXT:    mov z25.d, #0 // =0x0
+; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    and z25.d, z25.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z26.d, z24.d
+; CHECK-NEXT:    and z26.d, z26.d, #0x8000000000000000
+; CHECK-NEXT:    orr z5.d, z25.d, z26.d
+; CHECK-NEXT:    fadd z5.d, z4.d, z5.d
+; CHECK-NEXT:    and z4.d, z4.d, #0x8000000000000000
+; CHECK-NEXT:    orr z4.d, z25.d, z4.d
+; CHECK-NEXT:    uzp2 z25.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z1.d, z4.d, z5.d
-; CHECK-NEXT:    orr z5.d, z26.d, z25.d
-; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
-; CHECK-NEXT:    fadd z5.d, z1.d, z5.d
-; CHECK-NEXT:    and z1.d, z1.d, #0x8000000000000000
-; CHECK-NEXT:    orr z1.d, z26.d, z1.d
-; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
-; CHECK-NEXT:    fsub z1.d, z1.d, z24.d
+; CHECK-NEXT:    uzp2 z1.d, z2.d, z3.d
+; CHECK-NEXT:    uzp1 z2.d, z2.d, z3.d
+; CHECK-NEXT:    fsub z4.d, z4.d, z24.d
 ; CHECK-NEXT:    uzp2 z24.d, z6.d, z7.d
-; CHECK-NEXT:    fmul z3.d, z0.d, z2.d
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uzp1 z6.d, z6.d, z7.d
-; CHECK-NEXT:    fmul z7.d, z1.d, z24.d
-; CHECK-NEXT:    fmla z3.d, p0/m, z27.d, z4.d
-; CHECK-NEXT:    fmla z7.d, p0/m, z6.d, z5.d
-; CHECK-NEXT:    fmul z2.d, z27.d, z2.d
-; CHECK-NEXT:    fmul z5.d, z5.d, z24.d
-; CHECK-NEXT:    fnmsb z0.d, p0/m, z4.d, z2.d
-; CHECK-NEXT:    fnmsb z1.d, p0/m, z6.d, z5.d
-; CHECK-NEXT:    fadd z1.d, z0.d, z1.d
-; CHECK-NEXT:    fadd z2.d, z3.d, z7.d
-; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
-; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z3.d, z0.d, z1.d
+; CHECK-NEXT:    fmul z1.d, z25.d, z1.d
+; CHECK-NEXT:    fmul z7.d, z4.d, z24.d
+; CHECK-NEXT:    fmul z24.d, z5.d, z24.d
+; CHECK-NEXT:    fmla z3.d, p0/m, z25.d, z2.d
+; CHECK-NEXT:    fnmsb z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    movprfx z1, z7
+; CHECK-NEXT:    fmla z1.d, p0/m, z6.d, z5.d
+; CHECK-NEXT:    movprfx z2, z24
+; CHECK-NEXT:    fnmls z2.d, p0/m, z4.d, z6.d
+; CHECK-NEXT:    fadd z2.d, z0.d, z2.d
+; CHECK-NEXT:    fadd z1.d, z3.d, z1.d
+; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
+; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
index 3e3c26b74956d2..aac43f7b960fed 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
@@ -41,19 +41,19 @@ entry:
 define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_add_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    mov z25.d, z24.d
-; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #0
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #0
-; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #0
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #0
-; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #90
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #90
-; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #90
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #90
-; CHECK-NEXT:    mov z1.d, z24.d
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
 ; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z1.d, z24.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -90,19 +90,19 @@ entry:
 define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_sub_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    mov z25.d, z24.d
-; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #270
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #270
-; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #270
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #0
-; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #180
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z6.d, z4.d, #180
-; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    fcmla z24.d, p0/m, z7.d, z5.d, #180
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #90
-; CHECK-NEXT:    mov z1.d, z24.d
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
 ; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z1.d, z24.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -139,19 +139,19 @@ entry:
 define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_conj_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    mov z25.d, z24.d
-; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #0
-; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z0.d, z2.d, #90
-; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #0
+; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z4.d, z6.d, #0
-; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #270
+; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #0
 ; CHECK-NEXT:    fcmla z25.d, p0/m, z4.d, z6.d, #270
-; CHECK-NEXT:    mov z1.d, z24.d
+; CHECK-NEXT:    fcmla z24.d, p0/m, z5.d, z7.d, #270
 ; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z1.d, z24.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -188,25 +188,26 @@ entry:
 define <vscale x 4 x double> @mul_add_rot_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
 ; CHECK-LABEL: mul_add_rot_mull:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z1.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z24.d, z2.d, z3.d
-; CHECK-NEXT:    fmul z2.d, z1.d, z0.d
+; CHECK-NEXT:    uzp1 z24.d, z2.d, z3.d
+; CHECK-NEXT:    uzp2 z25.d, z0.d, z1.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEXT:    fmul z0.d, z24.d, z0.d
-; CHECK-NEXT:    uzp2 z3.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z24.d, z6.d, z7.d
+; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
+; CHECK-NEXT:    uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z26.d, z6.d, z7.d
+; CHECK-NEXT:    fmul z1.d, z24.d, z25.d
+; CHECK-NEXT:    fmul z3.d, z2.d, z25.d
+; CHECK-NEXT:    uzp2 z25.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp1 z4.d, z4.d, z5.d
-; CHECK-NEXT:    fmla z0.d, p0/m, z24.d, z3.d
 ; CHECK-NEXT:    uzp2 z5.d, z6.d, z7.d
-; CHECK-NEXT:    fmla z2.d, p0/m, z24.d, z4.d
-; CHECK-NEXT:    fmla z0.d, p0/m, z5.d, z4.d
-; CHECK-NEXT:    fmls z2.d, p0/m, z5.d, z3.d
-; CHECK-NEXT:    fnmsb z1.d, p0/m, z25.d, z0.d
-; CHECK-NEXT:    zip1 z0.d, z1.d, z2.d
-; CHECK-NEXT:    zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmla z1.d, p0/m, z2.d, z0.d
+; CHECK-NEXT:    fmla z3.d, p0/m, z26.d, z25.d
+; CHECK-NEXT:    movprfx z2, z3
+; CHECK-NEXT:    fmla z2.d, p0/m, z5.d, z4.d
+; CHECK-NEXT:    fnmls z2.d, p0/m, z24.d, z0.d
+; CHECK-NEXT:    fmla z1.d, p0/m, z26.d, z4.d
+; CHECK-NEXT:    fmls z1.d, p0/m, z5.d, z25.d
+; CHECK-NEXT:    zip1 z0.d, z2.d, z1.d
+; CHECK-NEXT:    zip2 z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
index 37fb425b972cf1..48b5756b01fb87 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
@@ -85,9 +85,9 @@ define <vscale x 32 x half> @complex_add_v32f16(<vscale x 32 x half> %a, <vscale
 ; CHECK-LABEL: complex_add_v32f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcadd z6.h, p0/m, z6.h, z2.h, #90
 ; CHECK-NEXT:    fcadd z4.h, p0/m, z4.h, z0.h, #90
 ; CHECK-NEXT:    fcadd z5.h, p0/m, z5.h, z1.h, #90
+; CHECK-NEXT:    fcadd z6.h, p0/m, z6.h, z2.h, #90
 ; CHECK-NEXT:    fcadd z7.h, p0/m, z7.h, z3.h, #90
 ; CHECK-NEXT:    mov z0.d, z4.d
 ; CHECK-NEXT:    mov z1.d, z5.d

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
index 90c0d9e164717b..93497f38063d28 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
@@ -66,8 +66,8 @@ entry:
 define <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-LABEL: complex_add_v16f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcadd v0.8h, v2.8h, v0.8h, #90
 ; CHECK-NEXT:    fcadd v1.8h, v3.8h, v1.8h, #90
+; CHECK-NEXT:    fcadd v0.8h, v2.8h, v0.8h, #90
 ; CHECK-NEXT:    ret
 entry:
   %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -142,8 +142,8 @@ entry:
 define <16 x half> @complex_add_v16f16_with_intrinsic(<16 x half> %a, <16 x half> %b) {
 ; CHECK-LABEL: complex_add_v16f16_with_intrinsic:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcadd v0.8h, v2.8h, v0.8h, #90
 ; CHECK-NEXT:    fcadd v1.8h, v3.8h, v1.8h, #90
+; CHECK-NEXT:    fcadd v0.8h, v2.8h, v0.8h, #90
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %a)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
index 5bcb51bca85ec0..611cf44ea7ee87 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
@@ -18,8 +18,8 @@ define <vscale x 4 x half> @complex_mul_v4f16(<vscale x 4 x half> %a, <vscale x
 ; CHECK-NEXT:    uzp1 z1.d, z1.d, z3.d
 ; CHECK-NEXT:    movprfx z3, z2
 ; CHECK-NEXT:    fmul z3.h, p0/m, z3.h, z0.h
-; CHECK-NEXT:    fmla z3.h, p0/m, z1.h, z4.h
 ; CHECK-NEXT:    fmul z2.h, p0/m, z2.h, z4.h
+; CHECK-NEXT:    fmla z3.h, p0/m, z1.h, z4.h
 ; CHECK-NEXT:    fnmsb z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    zip2 z1.d, z0.d, z3.d
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
@@ -46,8 +46,8 @@ entry:
 define <vscale x 8 x half> @complex_mul_v8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
 ; CHECK-LABEL: complex_mul_v8f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
 ; CHECK-NEXT:    fcmla z2.h, p0/m, z1.h, z0.h, #0
 ; CHECK-NEXT:    fcmla z2.h, p0/m, z1.h, z0.h, #90
 ; CHECK-NEXT:    mov z0.d, z2.d
@@ -72,15 +72,15 @@ entry:
 define <vscale x 16 x half> @complex_mul_v16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b) {
 ; CHECK-LABEL: complex_mul_v16f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z4.h, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z4.h, #0 // =0x0
 ; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    fcmla z4.h, p0/m, z3.h, z1.h, #0
 ; CHECK-NEXT:    fcmla z5.h, p0/m, z2.h, z0.h, #0
-; CHECK-NEXT:    fcmla z4.h, p0/m, z3.h, z1.h, #90
+; CHECK-NEXT:    fcmla z4.h, p0/m, z3.h, z1.h, #0
 ; CHECK-NEXT:    fcmla z5.h, p0/m, z2.h, z0.h, #90
-; CHECK-NEXT:    mov z1.d, z4.d
+; CHECK-NEXT:    fcmla z4.h, p0/m, z3.h, z1.h, #90
 ; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
@@ -103,8 +103,8 @@ entry:
 define <vscale x 32 x half> @complex_mul_v32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
 ; CHECK-LABEL: complex_mul_v32f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z24.h, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z24.h, #0 // =0x0
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
index cb0a9f5236b5d1..f4e72ac5f810d3 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
@@ -87,12 +87,12 @@ define <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v4.2d, #0000000000000000
 ; CHECK-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v4.8h, v0.8h, v2.8h, #0
-; CHECK-NEXT:    fcmla v5.8h, v1.8h, v3.8h, #0
-; CHECK-NEXT:    fcmla v4.8h, v0.8h, v2.8h, #90
-; CHECK-NEXT:    fcmla v5.8h, v1.8h, v3.8h, #90
-; CHECK-NEXT:    mov v0.16b, v4.16b
-; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    fcmla v5.8h, v0.8h, v2.8h, #0
+; CHECK-NEXT:    fcmla v4.8h, v1.8h, v3.8h, #0
+; CHECK-NEXT:    fcmla v5.8h, v0.8h, v2.8h, #90
+; CHECK-NEXT:    fcmla v4.8h, v1.8h, v3.8h, #90
+; CHECK-NEXT:    mov v0.16b, v5.16b
+; CHECK-NEXT:    mov v1.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
   %a.real   = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -118,17 +118,17 @@ define <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) {
 ; CHECK-NEXT:    movi v18.2d, #0000000000000000
 ; CHECK-NEXT:    movi v19.2d, #0000000000000000
 ; CHECK-NEXT:    fcmla v16.8h, v0.8h, v4.8h, #0
-; CHECK-NEXT:    fcmla v17.8h, v1.8h, v5.8h, #0
-; CHECK-NEXT:    fcmla v18.8h, v2.8h, v6.8h, #0
-; CHECK-NEXT:    fcmla v19.8h, v3.8h, v7.8h, #0
+; CHECK-NEXT:    fcmla v18.8h, v1.8h, v5.8h, #0
+; CHECK-NEXT:    fcmla v17.8h, v3.8h, v7.8h, #0
+; CHECK-NEXT:    fcmla v19.8h, v2.8h, v6.8h, #0
 ; CHECK-NEXT:    fcmla v16.8h, v0.8h, v4.8h, #90
-; CHECK-NEXT:    fcmla v17.8h, v1.8h, v5.8h, #90
-; CHECK-NEXT:    fcmla v18.8h, v2.8h, v6.8h, #90
-; CHECK-NEXT:    fcmla v19.8h, v3.8h, v7.8h, #90
+; CHECK-NEXT:    fcmla v18.8h, v1.8h, v5.8h, #90
+; CHECK-NEXT:    fcmla v17.8h, v3.8h, v7.8h, #90
+; CHECK-NEXT:    fcmla v19.8h, v2.8h, v6.8h, #90
 ; CHECK-NEXT:    mov v0.16b, v16.16b
-; CHECK-NEXT:    mov v1.16b, v17.16b
-; CHECK-NEXT:    mov v2.16b, v18.16b
-; CHECK-NEXT:    mov v3.16b, v19.16b
+; CHECK-NEXT:    mov v1.16b, v18.16b
+; CHECK-NEXT:    mov v3.16b, v17.16b
+; CHECK-NEXT:    mov v2.16b, v19.16b
 ; CHECK-NEXT:    ret
 entry:
   %a.real   = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
index ae7be554a0c5e4..ab764a58a77082 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
@@ -51,9 +51,9 @@ define <vscale x 16 x float> @complex_add_v16f32(<vscale x 16 x float> %a, <vsca
 ; CHECK-LABEL: complex_add_v16f32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcadd z6.s, p0/m, z6.s, z2.s, #90
 ; CHECK-NEXT:    fcadd z4.s, p0/m, z4.s, z0.s, #90
 ; CHECK-NEXT:    fcadd z5.s, p0/m, z5.s, z1.s, #90
+; CHECK-NEXT:    fcadd z6.s, p0/m, z6.s, z2.s, #90
 ; CHECK-NEXT:    fcadd z7.s, p0/m, z7.s, z3.s, #90
 ; CHECK-NEXT:    mov z0.d, z4.d
 ; CHECK-NEXT:    mov z1.d, z5.d

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll
index f8d559eec34be9..c4d0c9364f1be0 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll
@@ -42,8 +42,8 @@ entry:
 define <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: complex_add_v8f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcadd v0.4s, v2.4s, v0.4s, #90
 ; CHECK-NEXT:    fcadd v1.4s, v3.4s, v1.4s, #90
+; CHECK-NEXT:    fcadd v0.4s, v2.4s, v0.4s, #90
 ; CHECK-NEXT:    ret
 entry:
   %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
index 917f3e5981543a..0f5e9a2202ddd4 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
@@ -7,8 +7,8 @@ target triple = "aarch64"
 define <vscale x 4 x float> @complex_mul_v4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: complex_mul_v4f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.s, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z2.s, #0 // =0x0
 ; CHECK-NEXT:    fcmla z2.s, p0/m, z1.s, z0.s, #0
 ; CHECK-NEXT:    fcmla z2.s, p0/m, z1.s, z0.s, #90
 ; CHECK-NEXT:    mov z0.d, z2.d
@@ -34,15 +34,15 @@ entry:
 define <vscale x 8 x float> @complex_mul_v8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b) {
 ; CHECK-LABEL: complex_mul_v8f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z4.s, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z4.s, #0 // =0x0
 ; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    fcmla z4.s, p0/m, z3.s, z1.s, #0
 ; CHECK-NEXT:    fcmla z5.s, p0/m, z2.s, z0.s, #0
-; CHECK-NEXT:    fcmla z4.s, p0/m, z3.s, z1.s, #90
+; CHECK-NEXT:    fcmla z4.s, p0/m, z3.s, z1.s, #0
 ; CHECK-NEXT:    fcmla z5.s, p0/m, z2.s, z0.s, #90
-; CHECK-NEXT:    mov z1.d, z4.d
+; CHECK-NEXT:    fcmla z4.s, p0/m, z3.s, z1.s, #90
 ; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
@@ -65,8 +65,8 @@ entry:
 define <vscale x 16 x float> @complex_mul_v16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
 ; CHECK-LABEL: complex_mul_v16f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z24.s, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z24.s, #0 // =0x0
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll
index b25ea19ae92174..05f07f6fd1c2c8 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll
@@ -57,12 +57,12 @@ define <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v4.2d, #0000000000000000
 ; CHECK-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v4.4s, v0.4s, v2.4s, #0
-; CHECK-NEXT:    fcmla v5.4s, v1.4s, v3.4s, #0
-; CHECK-NEXT:    fcmla v4.4s, v0.4s, v2.4s, #90
-; CHECK-NEXT:    fcmla v5.4s, v1.4s, v3.4s, #90
-; CHECK-NEXT:    mov v0.16b, v4.16b
-; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    fcmla v5.4s, v0.4s, v2.4s, #0
+; CHECK-NEXT:    fcmla v4.4s, v1.4s, v3.4s, #0
+; CHECK-NEXT:    fcmla v5.4s, v0.4s, v2.4s, #90
+; CHECK-NEXT:    fcmla v4.4s, v1.4s, v3.4s, #90
+; CHECK-NEXT:    mov v0.16b, v5.16b
+; CHECK-NEXT:    mov v1.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
   %a.real   = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -88,17 +88,17 @@ define <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-NEXT:    movi v18.2d, #0000000000000000
 ; CHECK-NEXT:    movi v19.2d, #0000000000000000
 ; CHECK-NEXT:    fcmla v16.4s, v0.4s, v4.4s, #0
-; CHECK-NEXT:    fcmla v17.4s, v1.4s, v5.4s, #0
-; CHECK-NEXT:    fcmla v18.4s, v2.4s, v6.4s, #0
-; CHECK-NEXT:    fcmla v19.4s, v3.4s, v7.4s, #0
+; CHECK-NEXT:    fcmla v18.4s, v1.4s, v5.4s, #0
+; CHECK-NEXT:    fcmla v17.4s, v3.4s, v7.4s, #0
+; CHECK-NEXT:    fcmla v19.4s, v2.4s, v6.4s, #0
 ; CHECK-NEXT:    fcmla v16.4s, v0.4s, v4.4s, #90
-; CHECK-NEXT:    fcmla v17.4s, v1.4s, v5.4s, #90
-; CHECK-NEXT:    fcmla v18.4s, v2.4s, v6.4s, #90
-; CHECK-NEXT:    fcmla v19.4s, v3.4s, v7.4s, #90
+; CHECK-NEXT:    fcmla v18.4s, v1.4s, v5.4s, #90
+; CHECK-NEXT:    fcmla v17.4s, v3.4s, v7.4s, #90
+; CHECK-NEXT:    fcmla v19.4s, v2.4s, v6.4s, #90
 ; CHECK-NEXT:    mov v0.16b, v16.16b
-; CHECK-NEXT:    mov v1.16b, v17.16b
-; CHECK-NEXT:    mov v2.16b, v18.16b
-; CHECK-NEXT:    mov v3.16b, v19.16b
+; CHECK-NEXT:    mov v1.16b, v18.16b
+; CHECK-NEXT:    mov v3.16b, v17.16b
+; CHECK-NEXT:    mov v2.16b, v19.16b
 ; CHECK-NEXT:    ret
 entry:
   %a.real   = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
index 4dcc7d48ac01cf..46a15f489d2b37 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
@@ -52,9 +52,9 @@ define <vscale x 8 x double> @complex_add_v8f64(<vscale x 8 x double> %a, <vscal
 ; CHECK-LABEL: complex_add_v8f64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcadd z6.d, p0/m, z6.d, z2.d, #90
 ; CHECK-NEXT:    fcadd z4.d, p0/m, z4.d, z0.d, #90
 ; CHECK-NEXT:    fcadd z5.d, p0/m, z5.d, z1.d, #90
+; CHECK-NEXT:    fcadd z6.d, p0/m, z6.d, z2.d, #90
 ; CHECK-NEXT:    fcadd z7.d, p0/m, z7.d, z3.d, #90
 ; CHECK-NEXT:    mov z0.d, z4.d
 ; CHECK-NEXT:    mov z1.d, z5.d

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll
index 1a701b2c53da45..e0c76b7bbe7167 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll
@@ -25,8 +25,8 @@ entry:
 define <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: complex_add_v4f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fcadd v0.2d, v2.2d, v0.2d, #90
 ; CHECK-NEXT:    fcadd v1.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    fcadd v0.2d, v2.2d, v0.2d, #90
 ; CHECK-NEXT:    ret
 entry:
   %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> <i32 0, i32 2>

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
index 00dbce35919834..1fe554bdc616e6 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
@@ -7,8 +7,8 @@ target triple = "aarch64"
 define <vscale x 2 x double> @complex_mul_v2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
 ; CHECK-LABEL: complex_mul_v2f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.d, #0 // =0x0
 ; CHECK-NEXT:    fcmla z2.d, p0/m, z1.d, z0.d, #0
 ; CHECK-NEXT:    fcmla z2.d, p0/m, z1.d, z0.d, #90
 ; CHECK-NEXT:    mov z0.d, z2.d
@@ -34,15 +34,15 @@ entry:
 define <vscale x 4 x double> @complex_mul_v4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b) {
 ; CHECK-LABEL: complex_mul_v4f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z4.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z4.d, #0 // =0x0
 ; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    fcmla z4.d, p0/m, z3.d, z1.d, #0
 ; CHECK-NEXT:    fcmla z5.d, p0/m, z2.d, z0.d, #0
-; CHECK-NEXT:    fcmla z4.d, p0/m, z3.d, z1.d, #90
+; CHECK-NEXT:    fcmla z4.d, p0/m, z3.d, z1.d, #0
 ; CHECK-NEXT:    fcmla z5.d, p0/m, z2.d, z0.d, #90
-; CHECK-NEXT:    mov z1.d, z4.d
+; CHECK-NEXT:    fcmla z4.d, p0/m, z3.d, z1.d, #90
 ; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -65,8 +65,8 @@ entry:
 define <vscale x 8 x double> @complex_mul_v8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
 ; CHECK-LABEL: complex_mul_v8f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z24.d, #0 // =0x0
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll
index 4de56ee7306881..6df59951e2143c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll
@@ -33,12 +33,12 @@ define <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v4.2d, #0000000000000000
 ; CHECK-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v4.2d, v0.2d, v2.2d, #0
-; CHECK-NEXT:    fcmla v5.2d, v1.2d, v3.2d, #0
-; CHECK-NEXT:    fcmla v4.2d, v0.2d, v2.2d, #90
-; CHECK-NEXT:    fcmla v5.2d, v1.2d, v3.2d, #90
-; CHECK-NEXT:    mov v0.16b, v4.16b
-; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    fcmla v5.2d, v0.2d, v2.2d, #0
+; CHECK-NEXT:    fcmla v4.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT:    fcmla v5.2d, v0.2d, v2.2d, #90
+; CHECK-NEXT:    fcmla v4.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT:    mov v0.16b, v5.16b
+; CHECK-NEXT:    mov v1.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
   %a.real   = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -64,17 +64,17 @@ define <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) {
 ; CHECK-NEXT:    movi v18.2d, #0000000000000000
 ; CHECK-NEXT:    movi v19.2d, #0000000000000000
 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v4.2d, #0
-; CHECK-NEXT:    fcmla v17.2d, v1.2d, v5.2d, #0
-; CHECK-NEXT:    fcmla v18.2d, v2.2d, v6.2d, #0
-; CHECK-NEXT:    fcmla v19.2d, v3.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v18.2d, v1.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v19.2d, v2.2d, v6.2d, #0
 ; CHECK-NEXT:    fcmla v16.2d, v0.2d, v4.2d, #90
-; CHECK-NEXT:    fcmla v17.2d, v1.2d, v5.2d, #90
-; CHECK-NEXT:    fcmla v18.2d, v2.2d, v6.2d, #90
-; CHECK-NEXT:    fcmla v19.2d, v3.2d, v7.2d, #90
+; CHECK-NEXT:    fcmla v18.2d, v1.2d, v5.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v7.2d, #90
+; CHECK-NEXT:    fcmla v19.2d, v2.2d, v6.2d, #90
 ; CHECK-NEXT:    mov v0.16b, v16.16b
-; CHECK-NEXT:    mov v1.16b, v17.16b
-; CHECK-NEXT:    mov v2.16b, v18.16b
-; CHECK-NEXT:    mov v3.16b, v19.16b
+; CHECK-NEXT:    mov v1.16b, v18.16b
+; CHECK-NEXT:    mov v3.16b, v17.16b
+; CHECK-NEXT:    mov v2.16b, v19.16b
 ; CHECK-NEXT:    ret
 entry:
   %a.real   = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
index 2c46d5d032d702..6f4f8d3ca2d37c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-add-scalable.ll
@@ -58,8 +58,8 @@ entry:
 define <vscale x 16 x i16> @complex_add_v16i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
 ; CHECK-LABEL: complex_add_v16i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cadd z2.h, z2.h, z0.h, #90
 ; CHECK-NEXT:    cadd z3.h, z3.h, z1.h, #90
+; CHECK-NEXT:    cadd z2.h, z2.h, z0.h, #90
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
index 94f720c94c7571..b0a3e46c96c49b 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll
@@ -11,15 +11,15 @@ define <vscale x 4 x i16> @complex_mul_v4i16(<vscale x 4 x i16> %a, <vscale x 4
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpkhi z3.d, z1.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uzp1 z4.d, z0.d, z2.d
 ; CHECK-NEXT:    uzp2 z0.d, z0.d, z2.d
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z3.d
-; CHECK-NEXT:    uzp1 z1.d, z1.d, z3.d
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mul z3.d, z1.d, z0.d
-; CHECK-NEXT:    mul z1.d, z1.d, z4.d
-; CHECK-NEXT:    mla z3.d, p0/m, z2.d, z4.d
-; CHECK-NEXT:    msb z0.d, p0/m, z2.d, z1.d
+; CHECK-NEXT:    uzp1 z2.d, z1.d, z3.d
+; CHECK-NEXT:    uzp2 z1.d, z1.d, z3.d
+; CHECK-NEXT:    mul z3.d, z2.d, z0.d
+; CHECK-NEXT:    mul z2.d, z2.d, z4.d
+; CHECK-NEXT:    mla z3.d, p0/m, z1.d, z4.d
+; CHECK-NEXT:    msb z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    zip2 z1.d, z0.d, z3.d
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z3.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
@@ -104,18 +104,18 @@ define <vscale x 32 x i16> @complex_mul_v32i16(<vscale x 32 x i16> %a, <vscale x
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d
+; CHECK-NEXT:    cmla z24.h, z7.h, z3.h, #0
 ; CHECK-NEXT:    cmla z25.h, z4.h, z0.h, #0
 ; CHECK-NEXT:    cmla z26.h, z5.h, z1.h, #0
 ; CHECK-NEXT:    cmla z27.h, z6.h, z2.h, #0
-; CHECK-NEXT:    cmla z24.h, z7.h, z3.h, #0
+; CHECK-NEXT:    cmla z24.h, z7.h, z3.h, #90
 ; CHECK-NEXT:    cmla z25.h, z4.h, z0.h, #90
 ; CHECK-NEXT:    cmla z26.h, z5.h, z1.h, #90
 ; CHECK-NEXT:    cmla z27.h, z6.h, z2.h, #90
-; CHECK-NEXT:    cmla z24.h, z7.h, z3.h, #90
+; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    mov z1.d, z26.d
 ; CHECK-NEXT:    mov z2.d, z27.d
-; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <vscale x 16 x i16>, <vscale x 16 x i16> } @llvm.experimental.vector.deinterleave2.nxv32i16(<vscale x 32 x i16> %a)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
index 4f06af2f87d7a4..3118d8669dc9b9 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-add-scalable.ll
@@ -27,8 +27,8 @@ entry:
 define <vscale x 8 x i32> @complex_add_v8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) {
 ; CHECK-LABEL: complex_add_v8i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cadd z2.s, z2.s, z0.s, #90
 ; CHECK-NEXT:    cadd z3.s, z3.s, z1.s, #90
+; CHECK-NEXT:    cadd z2.s, z2.s, z0.s, #90
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
index 831a4b35605065..256ed10cad07d1 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll
@@ -67,18 +67,18 @@ define <vscale x 16 x i32> @complex_mul_v16i32(<vscale x 16 x i32> %a, <vscale x
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d
+; CHECK-NEXT:    cmla z24.s, z7.s, z3.s, #0
 ; CHECK-NEXT:    cmla z25.s, z4.s, z0.s, #0
 ; CHECK-NEXT:    cmla z26.s, z5.s, z1.s, #0
 ; CHECK-NEXT:    cmla z27.s, z6.s, z2.s, #0
-; CHECK-NEXT:    cmla z24.s, z7.s, z3.s, #0
+; CHECK-NEXT:    cmla z24.s, z7.s, z3.s, #90
 ; CHECK-NEXT:    cmla z25.s, z4.s, z0.s, #90
 ; CHECK-NEXT:    cmla z26.s, z5.s, z1.s, #90
 ; CHECK-NEXT:    cmla z27.s, z6.s, z2.s, #90
-; CHECK-NEXT:    cmla z24.s, z7.s, z3.s, #90
+; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    mov z1.d, z26.d
 ; CHECK-NEXT:    mov z2.d, z27.d
-; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %a)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
index 647b4ecc458ab9..d9ec5fcd3bdc25 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-add-scalable.ll
@@ -27,8 +27,8 @@ entry:
 define <vscale x 4 x i64> @complex_add_v4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b) {
 ; CHECK-LABEL: complex_add_v4i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cadd z2.d, z2.d, z0.d, #90
 ; CHECK-NEXT:    cadd z3.d, z3.d, z1.d, #90
+; CHECK-NEXT:    cadd z2.d, z2.d, z0.d, #90
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
index 02a4f2a821a0c7..2dec03b6f979aa 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll
@@ -67,18 +67,18 @@ define <vscale x 8 x i64> @complex_mul_v8i64(<vscale x 8 x i64> %a, <vscale x 8
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d
+; CHECK-NEXT:    cmla z24.d, z7.d, z3.d, #0
 ; CHECK-NEXT:    cmla z25.d, z4.d, z0.d, #0
 ; CHECK-NEXT:    cmla z26.d, z5.d, z1.d, #0
 ; CHECK-NEXT:    cmla z27.d, z6.d, z2.d, #0
-; CHECK-NEXT:    cmla z24.d, z7.d, z3.d, #0
+; CHECK-NEXT:    cmla z24.d, z7.d, z3.d, #90
 ; CHECK-NEXT:    cmla z25.d, z4.d, z0.d, #90
 ; CHECK-NEXT:    cmla z26.d, z5.d, z1.d, #90
 ; CHECK-NEXT:    cmla z27.d, z6.d, z2.d, #90
-; CHECK-NEXT:    cmla z24.d, z7.d, z3.d, #90
+; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    mov z1.d, z26.d
 ; CHECK-NEXT:    mov z2.d, z27.d
-; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)
@@ -105,18 +105,18 @@ define <vscale x 8 x i64> @complex_minus_mul_v8i64(<vscale x 8 x i64> %a, <vscal
 ; CHECK-NEXT:    mov z25.d, z24.d
 ; CHECK-NEXT:    mov z26.d, z24.d
 ; CHECK-NEXT:    mov z27.d, z24.d
+; CHECK-NEXT:    cmla z24.d, z7.d, z3.d, #270
 ; CHECK-NEXT:    cmla z25.d, z4.d, z0.d, #270
 ; CHECK-NEXT:    cmla z26.d, z5.d, z1.d, #270
 ; CHECK-NEXT:    cmla z27.d, z6.d, z2.d, #270
-; CHECK-NEXT:    cmla z24.d, z7.d, z3.d, #270
+; CHECK-NEXT:    cmla z24.d, z7.d, z3.d, #180
 ; CHECK-NEXT:    cmla z25.d, z4.d, z0.d, #180
 ; CHECK-NEXT:    cmla z26.d, z5.d, z1.d, #180
 ; CHECK-NEXT:    cmla z27.d, z6.d, z2.d, #180
-; CHECK-NEXT:    cmla z24.d, z7.d, z3.d, #180
+; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    mov z0.d, z25.d
 ; CHECK-NEXT:    mov z1.d, z26.d
 ; CHECK-NEXT:    mov z2.d, z27.d
-; CHECK-NEXT:    mov z3.d, z24.d
 ; CHECK-NEXT:    ret
 entry:
   %a.deinterleaved = tail call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.experimental.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %a)

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
index 41577df7a18b7c..e7ebd07fd73149 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i8-add-scalable.ll
@@ -58,8 +58,8 @@ entry:
 define <vscale x 32 x i8> @complex_add_v32i8(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
 ; CHECK-LABEL: complex_add_v32i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cadd z2.b, z2.b, z0.b, #90
 ; CHECK-NEXT:    cadd z3.b, z3.b, z1.b, #90
+; CHECK-NEXT:    cadd z2.b, z2.b, z0.b, #90
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    mov z1.d, z3.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
index fdeb4a0f5548f6..f7837b2367671d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
@@ -82,8 +82,8 @@ define <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %
 ; CHECK-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
 ; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    zip1 v5.2s, v2.2s, v3.2s
-; CHECK-NEXT:    zip2 v2.2s, v2.2s, v3.2s
 ; CHECK-NEXT:    zip1 v6.2s, v1.2s, v4.2s
+; CHECK-NEXT:    zip2 v2.2s, v2.2s, v3.2s
 ; CHECK-NEXT:    zip2 v1.2s, v1.2s, v4.2s
 ; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    fmul v7.2s, v6.2s, v5.2s
@@ -220,11 +220,11 @@ define <4 x float> @mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v3.4s, v2.4s, v0.4s, #0
-; CHECK-NEXT:    fcmla v4.4s, v1.4s, v0.4s, #0
-; CHECK-NEXT:    fcmla v3.4s, v2.4s, v0.4s, #90
-; CHECK-NEXT:    fcmla v4.4s, v1.4s, v0.4s, #90
-; CHECK-NEXT:    fcadd v0.4s, v3.4s, v4.4s, #90
+; CHECK-NEXT:    fcmla v4.4s, v2.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v3.4s, v1.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v4.4s, v2.4s, v0.4s, #90
+; CHECK-NEXT:    fcmla v3.4s, v1.4s, v0.4s, #90
+; CHECK-NEXT:    fcadd v0.4s, v4.4s, v3.4s, #90
 ; CHECK-NEXT:    ret
 entry:
   %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
@@ -258,25 +258,25 @@ entry:
 define <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: mul_triangle_addmul:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    zip1 v5.2s, v1.2s, v3.2s
-; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
-; CHECK-NEXT:    zip1 v6.2s, v0.2s, v4.2s
-; CHECK-NEXT:    zip2 v0.2s, v0.2s, v4.2s
-; CHECK-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT:    fmul v7.2s, v5.2s, v6.2s
-; CHECK-NEXT:    fmul v6.2s, v1.2s, v6.2s
-; CHECK-NEXT:    zip1 v4.2s, v2.2s, v3.2s
-; CHECK-NEXT:    zip2 v2.2s, v2.2s, v3.2s
-; CHECK-NEXT:    fmov d3, d7
-; CHECK-NEXT:    fmov d16, d6
+; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v5.2s, v0.2s, v3.2s
+; CHECK-NEXT:    zip1 v6.2s, v1.2s, v4.2s
+; CHECK-NEXT:    zip2 v1.2s, v1.2s, v4.2s
+; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    zip2 v0.2s, v0.2s, v3.2s
+; CHECK-NEXT:    fmul v7.2s, v6.2s, v5.2s
+; CHECK-NEXT:    fmul v5.2s, v1.2s, v5.2s
+; CHECK-NEXT:    zip1 v3.2s, v2.2s, v4.2s
+; CHECK-NEXT:    zip2 v2.2s, v2.2s, v4.2s
+; CHECK-NEXT:    fmov d4, d7
+; CHECK-NEXT:    fmov d16, d5
 ; CHECK-NEXT:    fmls v7.2s, v0.2s, v2.2s
-; CHECK-NEXT:    fmla v6.2s, v0.2s, v4.2s
-; CHECK-NEXT:    fmls v3.2s, v0.2s, v1.2s
-; CHECK-NEXT:    fmla v16.2s, v0.2s, v5.2s
+; CHECK-NEXT:    fmla v5.2s, v0.2s, v3.2s
+; CHECK-NEXT:    fmls v4.2s, v0.2s, v1.2s
+; CHECK-NEXT:    fmla v16.2s, v0.2s, v6.2s
 ; CHECK-NEXT:    fsub v0.2s, v7.2s, v16.2s
-; CHECK-NEXT:    fadd v1.2s, v6.2s, v3.2s
+; CHECK-NEXT:    fadd v1.2s, v5.2s, v4.2s
 ; CHECK-NEXT:    zip1 v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
 entry:
@@ -314,8 +314,8 @@ define <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %
 ; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    zip2 v4.2s, v0.2s, v2.2s
-; CHECK-NEXT:    zip1 v0.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    zip1 v5.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
 ; CHECK-NEXT:    fmul v2.2s, v4.2s, v5.2s
 ; CHECK-NEXT:    fmul v3.2s, v1.2s, v4.2s
@@ -442,23 +442,23 @@ entry:
 define <4 x float> @mul_divequal(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: mul_divequal:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v16.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT:    zip2 v5.2s, v1.2s, v3.2s
-; CHECK-NEXT:    zip1 v1.2s, v1.2s, v3.2s
-; CHECK-NEXT:    zip2 v6.2s, v0.2s, v4.2s
-; CHECK-NEXT:    zip1 v0.2s, v0.2s, v4.2s
-; CHECK-NEXT:    zip1 v4.2s, v2.2s, v16.2s
-; CHECK-NEXT:    zip2 v2.2s, v2.2s, v16.2s
-; CHECK-NEXT:    fmul v7.2s, v6.2s, v5.2s
-; CHECK-NEXT:    fneg v3.2s, v7.2s
-; CHECK-NEXT:    fmla v3.2s, v0.2s, v1.2s
-; CHECK-NEXT:    fmul v0.2s, v5.2s, v0.2s
-; CHECK-NEXT:    fmla v0.2s, v6.2s, v1.2s
-; CHECK-NEXT:    fdiv v3.2s, v3.2s, v4.2s
+; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip2 v5.2s, v0.2s, v3.2s
+; CHECK-NEXT:    zip2 v6.2s, v1.2s, v4.2s
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v3.2s
+; CHECK-NEXT:    zip1 v1.2s, v1.2s, v4.2s
+; CHECK-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    fmul v7.2s, v5.2s, v6.2s
+; CHECK-NEXT:    fneg v4.2s, v7.2s
+; CHECK-NEXT:    zip1 v7.2s, v2.2s, v3.2s
+; CHECK-NEXT:    zip2 v2.2s, v2.2s, v3.2s
+; CHECK-NEXT:    fmla v4.2s, v0.2s, v1.2s
+; CHECK-NEXT:    fmul v0.2s, v6.2s, v0.2s
+; CHECK-NEXT:    fmla v0.2s, v5.2s, v1.2s
+; CHECK-NEXT:    fdiv v4.2s, v4.2s, v7.2s
 ; CHECK-NEXT:    fdiv v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    zip1 v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    zip1 v0.4s, v4.4s, v0.4s
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
index 3243a691493ca0..16dec1af60c1ca 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
@@ -49,17 +49,17 @@ define <4 x float> @mul_triangle_external_use(<4 x float> %a, <4 x float> %b, pt
 ; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    zip2 v4.2s, v0.2s, v2.2s
-; CHECK-NEXT:    zip1 v0.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    zip1 v5.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
 ; CHECK-NEXT:    fmul v2.2s, v4.2s, v5.2s
 ; CHECK-NEXT:    fmul v3.2s, v1.2s, v4.2s
 ; CHECK-NEXT:    fmla v2.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    fneg v1.2s, v3.2s
 ; CHECK-NEXT:    fmul v3.2s, v2.2s, v4.2s
+; CHECK-NEXT:    str d2, [x0]
 ; CHECK-NEXT:    fmla v1.2s, v0.2s, v5.2s
 ; CHECK-NEXT:    fmul v5.2s, v2.2s, v0.2s
-; CHECK-NEXT:    str d2, [x0]
 ; CHECK-NEXT:    fneg v3.2s, v3.2s
 ; CHECK-NEXT:    fmla v5.2s, v4.2s, v1.2s
 ; CHECK-NEXT:    fmla v3.2s, v0.2s, v1.2s
@@ -96,28 +96,27 @@ define <4 x float> @multiple_muls_shuffle_external(<4 x float> %a, <4 x float> %
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    ext v6.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
 ; CHECK-NEXT:    zip2 v7.2s, v0.2s, v5.2s
-; CHECK-NEXT:    zip1 v0.2s, v0.2s, v5.2s
 ; CHECK-NEXT:    zip1 v16.2s, v1.2s, v6.2s
 ; CHECK-NEXT:    zip2 v1.2s, v1.2s, v6.2s
-; CHECK-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT:    fcmla v4.4s, v3.4s, v2.4s, #0
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v5.2s
 ; CHECK-NEXT:    fmul v5.2s, v16.2s, v7.2s
-; CHECK-NEXT:    fmul v7.2s, v1.2s, v7.2s
-; CHECK-NEXT:    fcmla v4.4s, v3.4s, v2.4s, #90
+; CHECK-NEXT:    fmul v6.2s, v1.2s, v7.2s
 ; CHECK-NEXT:    fmla v5.2s, v0.2s, v1.2s
-; CHECK-NEXT:    fneg v1.2s, v7.2s
-; CHECK-NEXT:    zip1 v7.2s, v2.2s, v6.2s
-; CHECK-NEXT:    zip2 v6.2s, v2.2s, v6.2s
+; CHECK-NEXT:    fneg v1.2s, v6.2s
+; CHECK-NEXT:    zip1 v6.2s, v2.2s, v4.2s
+; CHECK-NEXT:    zip2 v4.2s, v2.2s, v4.2s
 ; CHECK-NEXT:    fmla v1.2s, v0.2s, v16.2s
-; CHECK-NEXT:    fmul v17.2s, v7.2s, v5.2s
-; CHECK-NEXT:    fmul v0.2s, v6.2s, v5.2s
+; CHECK-NEXT:    fmul v17.2s, v6.2s, v5.2s
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    fmul v5.2s, v4.2s, v5.2s
+; CHECK-NEXT:    fmla v17.2s, v1.2s, v4.2s
+; CHECK-NEXT:    fcmla v0.4s, v3.4s, v2.4s, #0
 ; CHECK-NEXT:    str d1, [x0]
-; CHECK-NEXT:    fmla v17.2s, v1.2s, v6.2s
-; CHECK-NEXT:    fneg v16.2s, v0.2s
-; CHECK-NEXT:    mov v0.16b, v4.16b
-; CHECK-NEXT:    fmla v16.2s, v1.2s, v7.2s
+; CHECK-NEXT:    fneg v16.2s, v5.2s
+; CHECK-NEXT:    fcmla v0.4s, v3.4s, v2.4s, #90
+; CHECK-NEXT:    fmla v16.2s, v1.2s, v6.2s
 ; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x1]
 ; CHECK-NEXT:    ret
 entry:
@@ -160,28 +159,28 @@ entry:
 define <4 x float> @multiple_muls_shuffle_external_with_loads(ptr %ptr_a, ptr %ptr_b, ptr %ptr_c, ptr %ptr_d, ptr %p1, ptr %p2) {
 ; CHECK-LABEL: multiple_muls_shuffle_external_with_loads:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ld2 { v1.2s, v2.2s }, [x0]
+; CHECK-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
+; CHECK-NEXT:    ld2 { v2.2s, v3.2s }, [x1]
+; CHECK-NEXT:    fmul v4.2s, v3.2s, v1.2s
+; CHECK-NEXT:    fmul v6.2s, v2.2s, v1.2s
+; CHECK-NEXT:    fneg v4.2s, v4.2s
+; CHECK-NEXT:    fmla v6.2s, v0.2s, v3.2s
+; CHECK-NEXT:    fmla v4.2s, v0.2s, v2.2s
+; CHECK-NEXT:    str d4, [x4]
+; CHECK-NEXT:    ldr q5, [x2]
+; CHECK-NEXT:    ext v7.16b, v5.16b, v5.16b, #8
+; CHECK-NEXT:    zip1 v0.2s, v5.2s, v7.2s
+; CHECK-NEXT:    zip2 v1.2s, v5.2s, v7.2s
+; CHECK-NEXT:    fmul v3.2s, v0.2s, v6.2s
+; CHECK-NEXT:    fmul v6.2s, v1.2s, v6.2s
+; CHECK-NEXT:    fmla v3.2s, v4.2s, v1.2s
+; CHECK-NEXT:    fneg v2.2s, v6.2s
+; CHECK-NEXT:    fmla v2.2s, v4.2s, v0.2s
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ld2 { v3.2s, v4.2s }, [x1]
-; CHECK-NEXT:    fmul v5.2s, v4.2s, v2.2s
-; CHECK-NEXT:    fmul v7.2s, v3.2s, v2.2s
-; CHECK-NEXT:    fneg v5.2s, v5.2s
-; CHECK-NEXT:    fmla v7.2s, v1.2s, v4.2s
-; CHECK-NEXT:    fmla v5.2s, v1.2s, v3.2s
-; CHECK-NEXT:    str d5, [x4]
-; CHECK-NEXT:    ldr q6, [x2]
-; CHECK-NEXT:    ext v16.16b, v6.16b, v6.16b, #8
-; CHECK-NEXT:    zip1 v1.2s, v6.2s, v16.2s
-; CHECK-NEXT:    zip2 v2.2s, v6.2s, v16.2s
-; CHECK-NEXT:    fmul v4.2s, v1.2s, v7.2s
-; CHECK-NEXT:    fmul v7.2s, v2.2s, v7.2s
-; CHECK-NEXT:    fmla v4.2s, v5.2s, v2.2s
-; CHECK-NEXT:    fneg v3.2s, v7.2s
-; CHECK-NEXT:    fmla v3.2s, v5.2s, v1.2s
-; CHECK-NEXT:    st2 { v3.2s, v4.2s }, [x5]
+; CHECK-NEXT:    st2 { v2.2s, v3.2s }, [x5]
 ; CHECK-NEXT:    ldr q1, [x3]
-; CHECK-NEXT:    fcmla v0.4s, v1.4s, v6.4s, #0
-; CHECK-NEXT:    fcmla v0.4s, v1.4s, v6.4s, #90
+; CHECK-NEXT:    fcmla v0.4s, v1.4s, v5.4s, #0
+; CHECK-NEXT:    fcmla v0.4s, v1.4s, v5.4s, #90
 ; CHECK-NEXT:    ret
 entry:
   %a = load <4 x float>, ptr %ptr_a
@@ -228,30 +227,30 @@ entry:
 define <4 x float> @multiple_muls_mul_external(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr %p1, ptr %p2) {
 ; CHECK-LABEL: multiple_muls_mul_external:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ext v5.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ext v6.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    ext v4.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    ext v7.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT:    zip2 v16.2s, v0.2s, v5.2s
-; CHECK-NEXT:    zip2 v17.2s, v1.2s, v6.2s
-; CHECK-NEXT:    zip1 v0.2s, v0.2s, v5.2s
-; CHECK-NEXT:    zip1 v1.2s, v1.2s, v6.2s
-; CHECK-NEXT:    zip1 v18.2s, v2.2s, v7.2s
-; CHECK-NEXT:    zip2 v2.2s, v2.2s, v7.2s
-; CHECK-NEXT:    zip2 v7.2s, v3.2s, v4.2s
-; CHECK-NEXT:    zip1 v3.2s, v3.2s, v4.2s
-; CHECK-NEXT:    fmul v19.2s, v16.2s, v17.2s
-; CHECK-NEXT:    fmul v5.2s, v18.2s, v7.2s
-; CHECK-NEXT:    fmul v6.2s, v2.2s, v7.2s
-; CHECK-NEXT:    fneg v4.2s, v19.2s
-; CHECK-NEXT:    fmul v7.2s, v0.2s, v17.2s
+; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    ext v16.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    ext v17.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT:    zip2 v6.2s, v0.2s, v4.2s
+; CHECK-NEXT:    zip2 v7.2s, v1.2s, v5.2s
+; CHECK-NEXT:    zip1 v19.2s, v2.2s, v16.2s
+; CHECK-NEXT:    zip2 v2.2s, v2.2s, v16.2s
+; CHECK-NEXT:    zip2 v16.2s, v3.2s, v17.2s
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v4.2s
+; CHECK-NEXT:    zip1 v1.2s, v1.2s, v5.2s
+; CHECK-NEXT:    zip1 v3.2s, v3.2s, v17.2s
+; CHECK-NEXT:    fmul v18.2s, v6.2s, v7.2s
+; CHECK-NEXT:    fmul v5.2s, v19.2s, v16.2s
+; CHECK-NEXT:    fmul v16.2s, v2.2s, v16.2s
+; CHECK-NEXT:    fmul v7.2s, v0.2s, v7.2s
+; CHECK-NEXT:    fneg v4.2s, v18.2s
 ; CHECK-NEXT:    fmla v5.2s, v3.2s, v2.2s
-; CHECK-NEXT:    fneg v2.2s, v6.2s
+; CHECK-NEXT:    fneg v2.2s, v16.2s
+; CHECK-NEXT:    fmla v7.2s, v1.2s, v6.2s
 ; CHECK-NEXT:    fmla v4.2s, v1.2s, v0.2s
-; CHECK-NEXT:    fmla v7.2s, v1.2s, v16.2s
-; CHECK-NEXT:    fmla v2.2s, v3.2s, v18.2s
-; CHECK-NEXT:    fmul v17.2s, v4.2s, v5.2s
+; CHECK-NEXT:    fmla v2.2s, v3.2s, v19.2s
 ; CHECK-NEXT:    fmul v0.2s, v7.2s, v5.2s
+; CHECK-NEXT:    fmul v17.2s, v4.2s, v5.2s
 ; CHECK-NEXT:    str d4, [x0]
 ; CHECK-NEXT:    fmla v17.2s, v2.2s, v7.2s
 ; CHECK-NEXT:    fneg v16.2s, v0.2s
@@ -299,34 +298,35 @@ entry:
 define void @mul_add_common_mul_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e, <4 x double> %f, <4 x double> %g, <4 x double> %h, ptr %p1, ptr %p2) {
 ; CHECK-LABEL: mul_add_common_mul_add_mul:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q17, q16, [sp, #64]
-; CHECK-NEXT:    movi v20.2d, #0000000000000000
-; CHECK-NEXT:    movi v21.2d, #0000000000000000
-; CHECK-NEXT:    movi v24.2d, #0000000000000000
-; CHECK-NEXT:    movi v25.2d, #0000000000000000
-; CHECK-NEXT:    ldp q19, q18, [sp, #96]
-; CHECK-NEXT:    fcmla v24.2d, v2.2d, v0.2d, #0
-; CHECK-NEXT:    fcmla v25.2d, v3.2d, v1.2d, #0
-; CHECK-NEXT:    fcmla v20.2d, v19.2d, v17.2d, #0
-; CHECK-NEXT:    fcmla v24.2d, v2.2d, v0.2d, #90
-; CHECK-NEXT:    fcmla v21.2d, v18.2d, v16.2d, #0
-; CHECK-NEXT:    ldp q23, q22, [sp, #32]
-; CHECK-NEXT:    fcmla v20.2d, v19.2d, v17.2d, #90
-; CHECK-NEXT:    fcmla v25.2d, v3.2d, v1.2d, #90
-; CHECK-NEXT:    fcmla v21.2d, v18.2d, v16.2d, #90
-; CHECK-NEXT:    fcmla v20.2d, v6.2d, v4.2d, #0
-; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    fcmla v21.2d, v7.2d, v5.2d, #0
-; CHECK-NEXT:    fcmla v20.2d, v6.2d, v4.2d, #90
-; CHECK-NEXT:    fcmla v21.2d, v7.2d, v5.2d, #90
-; CHECK-NEXT:    fsub v2.2d, v24.2d, v20.2d
-; CHECK-NEXT:    fcmla v20.2d, v1.2d, v23.2d, #0
-; CHECK-NEXT:    fsub v3.2d, v25.2d, v21.2d
-; CHECK-NEXT:    fcmla v21.2d, v0.2d, v22.2d, #0
-; CHECK-NEXT:    fcmla v20.2d, v1.2d, v23.2d, #90
-; CHECK-NEXT:    stp q2, q3, [x0]
-; CHECK-NEXT:    fcmla v21.2d, v0.2d, v22.2d, #90
-; CHECK-NEXT:    stp q20, q21, [x1]
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    movi v22.2d, #0000000000000000
+; CHECK-NEXT:    ldp q21, q18, [sp, #96]
+; CHECK-NEXT:    ldp q20, q19, [sp, #64]
+; CHECK-NEXT:    fcmla v22.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v18.2d, v19.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v21.2d, v20.2d, #0
+; CHECK-NEXT:    fcmla v22.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    ldr q1, [sp, #48]
+; CHECK-NEXT:    ldr q3, [sp]
+; CHECK-NEXT:    fcmla v16.2d, v18.2d, v19.2d, #90
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v17.2d, v21.2d, v20.2d, #90
+; CHECK-NEXT:    fcmla v16.2d, v7.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v18.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v6.2d, v4.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v7.2d, v5.2d, #90
+; CHECK-NEXT:    fcmla v18.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v6.2d, v4.2d, #90
+; CHECK-NEXT:    ldp q0, q2, [sp, #16]
+; CHECK-NEXT:    fsub v4.2d, v22.2d, v16.2d
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v1.2d, #0
+; CHECK-NEXT:    fsub v5.2d, v18.2d, v17.2d
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v2.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v1.2d, #90
+; CHECK-NEXT:    stp q5, q4, [x0]
+; CHECK-NEXT:    fcmla v17.2d, v3.2d, v2.2d, #90
+; CHECK-NEXT:    stp q17, q16, [x1]
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 2a034d70374099..f00265a80e0328 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -15,42 +15,42 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: complex_mul_v2f64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov w9, #100 // =0x64
-; CHECK-NEXT:    cntd x10
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    mov x11, x10
 ; CHECK-NEXT:    mov z1.d, #0 // =0x0
-; CHECK-NEXT:    rdvl x12, #2
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    whilelo p1.d, xzr, x9
+; CHECK-NEXT:    cntd x10
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    rdvl x11, #2
+; CHECK-NEXT:    mov x12, x10
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:  .LBB0_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
 ; CHECK-NEXT:    add x13, x0, x8
 ; CHECK-NEXT:    add x14, x1, x8
-; CHECK-NEXT:    zip1 p2.d, p1.d, p1.d
-; CHECK-NEXT:    zip2 p3.d, p1.d, p1.d
+; CHECK-NEXT:    zip1 p3.d, p1.d, p1.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    mov z7.d, z0.d
-; CHECK-NEXT:    ld1d { z2.d }, p3/z, [x13, #1, mul vl]
-; CHECK-NEXT:    ld1d { z3.d }, p2/z, [x13]
-; CHECK-NEXT:    ld1d { z4.d }, p3/z, [x14, #1, mul vl]
-; CHECK-NEXT:    ld1d { z5.d }, p2/z, [x14]
-; CHECK-NEXT:    whilelo p1.d, x11, x9
-; CHECK-NEXT:    add x8, x8, x12
-; CHECK-NEXT:    add x11, x11, x10
+; CHECK-NEXT:    whilelo p1.d, x12, x9
+; CHECK-NEXT:    add x8, x8, x11
+; CHECK-NEXT:    add x12, x12, x10
+; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x13, #1, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p3/z, [x13]
+; CHECK-NEXT:    ld1d { z4.d }, p2/z, [x14, #1, mul vl]
+; CHECK-NEXT:    ld1d { z5.d }, p3/z, [x14]
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #90
-; CHECK-NEXT:    mov z0.d, p3/m, z7.d
-; CHECK-NEXT:    mov z1.d, p2/m, z6.d
+; CHECK-NEXT:    mov z0.d, p2/m, z7.d
+; CHECK-NEXT:    mov z1.d, p3/m, z6.d
 ; CHECK-NEXT:    b.mi .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z0.d
-; CHECK-NEXT:    uzp1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    faddv d0, p0, z0.d
-; CHECK-NEXT:    faddv d1, p0, z2.d
+; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
+; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
+; CHECK-NEXT:    faddv d0, p0, z2.d
+; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-NEXT:    ret
@@ -113,17 +113,17 @@ exit.block:                                     ; preds = %vector.body
 define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %cond) {
 ; CHECK-LABEL: complex_mul_predicated_v2f64:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cntd x10
-; CHECK-NEXT:    mov w12, #100 // =0x64
 ; CHECK-NEXT:    neg x11, x10
+; CHECK-NEXT:    mov w12, #100 // =0x64
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    mov x9, xzr
 ; CHECK-NEXT:    and x11, x11, x12
-; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    rdvl x12, #2
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:  .LBB1_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld1w { z2.d }, p0/z, [x2, x9, lsl #2]
@@ -133,14 +133,14 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %
 ; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    add x9, x9, x10
 ; CHECK-NEXT:    add x8, x8, x12
-; CHECK-NEXT:    cmpne p2.d, p0/z, z2.d, #0
-; CHECK-NEXT:    zip1 p1.d, p2.d, p2.d
-; CHECK-NEXT:    zip2 p2.d, p2.d, p2.d
+; CHECK-NEXT:    cmpne p1.d, p0/z, z2.d, #0
+; CHECK-NEXT:    cmp x11, x9
+; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
+; CHECK-NEXT:    zip1 p1.d, p1.d, p1.d
 ; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x13, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z3.d }, p1/z, [x13]
 ; CHECK-NEXT:    ld1d { z4.d }, p2/z, [x14, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z5.d }, p1/z, [x14]
-; CHECK-NEXT:    cmp x11, x9
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
@@ -149,10 +149,10 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %
 ; CHECK-NEXT:    mov z1.d, p1/m, z6.d
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z0.d
-; CHECK-NEXT:    uzp1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    faddv d0, p0, z0.d
-; CHECK-NEXT:    faddv d1, p0, z2.d
+; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
+; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
+; CHECK-NEXT:    faddv d0, p0, z2.d
+; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-NEXT:    ret
@@ -218,15 +218,15 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-LABEL: complex_mul_predicated_x2_v2f64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov w10, #100 // =0x64
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    whilelo p1.d, xzr, x10
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    cntd x11
 ; CHECK-NEXT:    rdvl x12, #2
-; CHECK-NEXT:    whilelo p1.d, xzr, x10
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld1w { z2.d }, p1/z, [x2, x9, lsl #2]
@@ -237,25 +237,25 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT:    add x9, x9, x11
 ; CHECK-NEXT:    add x8, x8, x12
 ; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; CHECK-NEXT:    zip1 p2.d, p1.d, p1.d
-; CHECK-NEXT:    zip2 p3.d, p1.d, p1.d
-; CHECK-NEXT:    ld1d { z2.d }, p3/z, [x13, #1, mul vl]
-; CHECK-NEXT:    ld1d { z3.d }, p2/z, [x13]
-; CHECK-NEXT:    ld1d { z4.d }, p3/z, [x14, #1, mul vl]
-; CHECK-NEXT:    ld1d { z5.d }, p2/z, [x14]
+; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
+; CHECK-NEXT:    zip1 p3.d, p1.d, p1.d
 ; CHECK-NEXT:    whilelo p1.d, x9, x10
+; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x13, #1, mul vl]
+; CHECK-NEXT:    ld1d { z3.d }, p3/z, [x13]
+; CHECK-NEXT:    ld1d { z4.d }, p2/z, [x14, #1, mul vl]
+; CHECK-NEXT:    ld1d { z5.d }, p3/z, [x14]
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #0
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT:    fcmla z7.d, p0/m, z4.d, z2.d, #90
-; CHECK-NEXT:    mov z0.d, p3/m, z7.d
-; CHECK-NEXT:    mov z1.d, p2/m, z6.d
+; CHECK-NEXT:    mov z0.d, p2/m, z7.d
+; CHECK-NEXT:    mov z1.d, p3/m, z6.d
 ; CHECK-NEXT:    b.mi .LBB2_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z0.d
-; CHECK-NEXT:    uzp1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    faddv d0, p0, z0.d
-; CHECK-NEXT:    faddv d1, p0, z2.d
+; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
+; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
+; CHECK-NEXT:    faddv d0, p0, z2.d
+; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index a11f67bdcb046e..aefacc605474fa 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -14,26 +14,26 @@ target triple = "aarch64"
 define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: complex_mul_v2f64:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    mov w11, #100 // =0x64
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    neg x10, x9
+; CHECK-NEXT:    mov w11, #100 // =0x64
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    and x10, x10, x11
-; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    rdvl x11, #2
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:  .LBB0_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x12, x0, x8
 ; CHECK-NEXT:    add x13, x1, x8
 ; CHECK-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    subs x10, x10, x9
 ; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
+; CHECK-NEXT:    subs x10, x10, x9
 ; CHECK-NEXT:    add x8, x8, x11
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -41,10 +41,10 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z0.d
-; CHECK-NEXT:    uzp1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    faddv d0, p0, z0.d
-; CHECK-NEXT:    faddv d1, p0, z2.d
+; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
+; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
+; CHECK-NEXT:    faddv d0, p0, z2.d
+; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-NEXT:    ret
@@ -101,31 +101,31 @@ exit.block:                                     ; preds = %vector.body
 define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    ptrue p0.d, vl1
 ; CHECK-NEXT:    fmov d0, #1.00000000
-; CHECK-NEXT:    fmov d1, #2.00000000
-; CHECK-NEXT:    neg x10, x9
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    fmov d2, #2.00000000
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    mov w11, #100 // =0x64
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    neg x10, x9
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    and x10, x10, x11
-; CHECK-NEXT:    mov z2.d, #0 // =0x0
-; CHECK-NEXT:    ptrue p0.d, vl1
 ; CHECK-NEXT:    rdvl x11, #2
-; CHECK-NEXT:    sel z3.d, p0, z0.d, z2.d
-; CHECK-NEXT:    sel z1.d, p0, z1.d, z2.d
+; CHECK-NEXT:    sel z3.d, p0, z0.d, z1.d
+; CHECK-NEXT:    mov z1.d, p0/m, z2.d
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z3.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z3.d
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:  .LBB1_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x12, x0, x8
 ; CHECK-NEXT:    add x13, x1, x8
 ; CHECK-NEXT:    ld1b { z2.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    subs x10, x10, x9
 ; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x13, #1, mul vl]
+; CHECK-NEXT:    subs x10, x10, x9
 ; CHECK-NEXT:    add x8, x8, x11
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -133,10 +133,10 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z3.d, #90
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
-; CHECK-NEXT:    uzp2 z2.d, z1.d, z0.d
-; CHECK-NEXT:    uzp1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    faddv d0, p0, z0.d
-; CHECK-NEXT:    faddv d1, p0, z2.d
+; CHECK-NEXT:    uzp1 z2.d, z1.d, z0.d
+; CHECK-NEXT:    uzp2 z1.d, z1.d, z0.d
+; CHECK-NEXT:    faddv d0, p0, z2.d
+; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-NEXT:    ret
@@ -189,21 +189,21 @@ exit.block:                                     ; preds = %vector.body
 define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-LABEL: complex_mul_v2f64_unrolled:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    cntw x9
-; CHECK-NEXT:    mov w11, #1000 // =0x3e8
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    neg x10, x9
+; CHECK-NEXT:    mov w11, #1000 // =0x3e8
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    and x10, x10, x11
-; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    rdvl x11, #4
 ; CHECK-NEXT:    zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT:    zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT:    rdvl x11, #4
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    addvl x12, x1, #2
 ; CHECK-NEXT:    addvl x13, x0, #2
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x14, x0, x8
@@ -211,7 +211,6 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT:    add x16, x1, x8
 ; CHECK-NEXT:    add x17, x12, x8
 ; CHECK-NEXT:    ld1b { z4.b }, p1/z, [x0, x8]
-; CHECK-NEXT:    subs x10, x10, x9
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x14, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z6.b }, p1/z, [x13, x8]
 ; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x15, #1, mul vl]
@@ -219,6 +218,7 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ld1d { z17.d }, p0/z, [x16, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z18.b }, p1/z, [x12, x8]
 ; CHECK-NEXT:    ld1d { z19.d }, p0/z, [x17, #1, mul vl]
+; CHECK-NEXT:    subs x10, x10, x9
 ; CHECK-NEXT:    add x8, x8, x11
 ; CHECK-NEXT:    fcmla z1.d, p0/m, z16.d, z4.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z17.d, z5.d, #0
@@ -230,14 +230,14 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fcmla z3.d, p0/m, z19.d, z7.d, #90
 ; CHECK-NEXT:    b.ne .LBB2_1
 ; CHECK-NEXT:  // %bb.2: // %exit.block
-; CHECK-NEXT:    uzp2 z4.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z2.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z3.d, z1.d, z0.d
-; CHECK-NEXT:    uzp1 z0.d, z1.d, z0.d
-; CHECK-NEXT:    fadd z0.d, z2.d, z0.d
-; CHECK-NEXT:    fadd z1.d, z4.d, z3.d
-; CHECK-NEXT:    faddv d0, p0, z0.d
-; CHECK-NEXT:    faddv d1, p0, z1.d
+; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
+; CHECK-NEXT:    uzp1 z5.d, z1.d, z0.d
+; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
+; CHECK-NEXT:    uzp2 z0.d, z1.d, z0.d
+; CHECK-NEXT:    fadd z1.d, z4.d, z5.d
+; CHECK-NEXT:    fadd z2.d, z2.d, z0.d
+; CHECK-NEXT:    faddv d0, p0, z1.d
+; CHECK-NEXT:    faddv d1, p0, z2.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-NEXT:    ret
@@ -322,16 +322,16 @@ exit.block:                                     ; preds = %vector.body
 define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalias nocapture noundef readnone %c, [2 x double] %d.coerce, ptr nocapture noundef readonly %s, ptr nocapture noundef writeonly %outs) local_unnamed_addr #0 {
 ; CHECK-LABEL: reduction_mix:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z2.d, #0 // =0x0
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    mov w11, #100 // =0x64
 ; CHECK-NEXT:    neg x10, x9
+; CHECK-NEXT:    mov w11, #100 // =0x64
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    and x10, x10, x11
-; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    rdvl x11, #2
-; CHECK-NEXT:    zip2 z1.d, z0.d, z0.d
-; CHECK-NEXT:    zip1 z2.d, z0.d, z0.d
-; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    zip2 z0.d, z2.d, z2.d
+; CHECK-NEXT:    zip1 z1.d, z2.d, z2.d
 ; CHECK-NEXT:  .LBB3_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld1w { z3.d }, p0/z, [x3, x8, lsl #2]
@@ -340,17 +340,17 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
 ; CHECK-NEXT:    add x8, x8, x9
 ; CHECK-NEXT:    add x0, x0, x11
 ; CHECK-NEXT:    cmp x10, x8
-; CHECK-NEXT:    add z0.d, z3.d, z0.d
-; CHECK-NEXT:    fadd z2.d, z4.d, z2.d
-; CHECK-NEXT:    fadd z1.d, z5.d, z1.d
+; CHECK-NEXT:    fadd z0.d, z5.d, z0.d
+; CHECK-NEXT:    fadd z1.d, z4.d, z1.d
+; CHECK-NEXT:    add z2.d, z3.d, z2.d
 ; CHECK-NEXT:    b.ne .LBB3_1
 ; CHECK-NEXT:  // %bb.2: // %middle.block
-; CHECK-NEXT:    uzp1 z3.d, z2.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
-; CHECK-NEXT:    uaddv d2, p0, z0.d
-; CHECK-NEXT:    faddv d0, p0, z1.d
+; CHECK-NEXT:    uaddv d2, p0, z2.d
+; CHECK-NEXT:    uzp2 z3.d, z1.d, z0.d
+; CHECK-NEXT:    uzp1 z1.d, z1.d, z0.d
 ; CHECK-NEXT:    fmov x8, d2
-; CHECK-NEXT:    faddv d1, p0, z3.d
+; CHECK-NEXT:    faddv d0, p0, z3.d
+; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $z1
 ; CHECK-NEXT:    str w8, [x4]

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
index 2eb1c9e07407e9..d245c0a0e4823d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll
@@ -15,26 +15,26 @@ define dso_local %"struct.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: complex_mul_v2f64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB0_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    add x10, x1, x8
 ; CHECK-NEXT:    add x8, x8, #32
-; CHECK-NEXT:    cmp x8, #1600
 ; CHECK-NEXT:    ldp q3, q2, [x9]
-; CHECK-NEXT:    ldp q4, q5, [x10]
-; CHECK-NEXT:    fcmla v0.2d, v3.2d, v4.2d, #0
-; CHECK-NEXT:    fcmla v1.2d, v2.2d, v5.2d, #0
-; CHECK-NEXT:    fcmla v0.2d, v3.2d, v4.2d, #90
-; CHECK-NEXT:    fcmla v1.2d, v2.2d, v5.2d, #90
+; CHECK-NEXT:    cmp x8, #1600
+; CHECK-NEXT:    ldp q5, q4, [x10]
+; CHECK-NEXT:    fcmla v0.2d, v3.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v1.2d, v2.2d, v4.2d, #0
+; CHECK-NEXT:    fcmla v0.2d, v3.2d, v5.2d, #90
+; CHECK-NEXT:    fcmla v1.2d, v2.2d, v4.2d, #90
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %middle.block
 ; CHECK-NEXT:    zip2 v2.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    zip1 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    faddp d1, v2.2d
 ; CHECK-NEXT:    faddp d0, v0.2d
+; CHECK-NEXT:    faddp d1, v2.2d
 ; CHECK-NEXT:    ret
 entry:
   br label %vector.body
@@ -80,28 +80,28 @@ middle.block:                                     ; preds = %vector.body
 define %"struct.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: complex_mul_nonzero_init_v2f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, .LCPI1_0
-; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI1_0]
+; CHECK-NEXT:    adrp x8, .LCPI1_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB1_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    add x10, x1, x8
 ; CHECK-NEXT:    add x8, x8, #32
-; CHECK-NEXT:    cmp x8, #1600
 ; CHECK-NEXT:    ldp q3, q2, [x9]
-; CHECK-NEXT:    ldp q4, q5, [x10]
-; CHECK-NEXT:    fcmla v1.2d, v3.2d, v4.2d, #0
-; CHECK-NEXT:    fcmla v0.2d, v2.2d, v5.2d, #0
-; CHECK-NEXT:    fcmla v1.2d, v3.2d, v4.2d, #90
-; CHECK-NEXT:    fcmla v0.2d, v2.2d, v5.2d, #90
+; CHECK-NEXT:    cmp x8, #1600
+; CHECK-NEXT:    ldp q5, q4, [x10]
+; CHECK-NEXT:    fcmla v1.2d, v3.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v0.2d, v2.2d, v4.2d, #0
+; CHECK-NEXT:    fcmla v1.2d, v3.2d, v5.2d, #90
+; CHECK-NEXT:    fcmla v0.2d, v2.2d, v4.2d, #90
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %middle.block
 ; CHECK-NEXT:    zip2 v2.2d, v1.2d, v0.2d
 ; CHECK-NEXT:    zip1 v0.2d, v1.2d, v0.2d
-; CHECK-NEXT:    faddp d1, v2.2d
 ; CHECK-NEXT:    faddp d0, v0.2d
+; CHECK-NEXT:    faddp d1, v2.2d
 ; CHECK-NEXT:    ret
 entry:
   br label %vector.body
@@ -143,40 +143,40 @@ middle.block:                                     ; preds = %vector.body
 define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-LABEL: complex_mul_v2f64_unrolled:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, .LCPI2_0
-; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    adrp x8, .LCPI2_0
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI2_0]
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    add x10, x1, x8
 ; CHECK-NEXT:    add x8, x8, #64
-; CHECK-NEXT:    cmp x8, #1600
 ; CHECK-NEXT:    ldp q5, q4, [x9]
-; CHECK-NEXT:    ldp q7, q6, [x9, #32]
-; CHECK-NEXT:    ldp q17, q16, [x10]
-; CHECK-NEXT:    fcmla v1.2d, v5.2d, v17.2d, #0
+; CHECK-NEXT:    cmp x8, #1600
+; CHECK-NEXT:    ldp q7, q6, [x10]
+; CHECK-NEXT:    ldp q17, q16, [x9, #32]
 ; CHECK-NEXT:    ldp q19, q18, [x10, #32]
-; CHECK-NEXT:    fcmla v0.2d, v4.2d, v16.2d, #0
-; CHECK-NEXT:    fcmla v1.2d, v5.2d, v17.2d, #90
-; CHECK-NEXT:    fcmla v2.2d, v7.2d, v19.2d, #0
-; CHECK-NEXT:    fcmla v0.2d, v4.2d, v16.2d, #90
-; CHECK-NEXT:    fcmla v3.2d, v6.2d, v18.2d, #0
-; CHECK-NEXT:    fcmla v2.2d, v7.2d, v19.2d, #90
-; CHECK-NEXT:    fcmla v3.2d, v6.2d, v18.2d, #90
+; CHECK-NEXT:    fcmla v1.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v0.2d, v4.2d, v6.2d, #0
+; CHECK-NEXT:    fcmla v2.2d, v17.2d, v19.2d, #0
+; CHECK-NEXT:    fcmla v3.2d, v16.2d, v18.2d, #0
+; CHECK-NEXT:    fcmla v1.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT:    fcmla v0.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT:    fcmla v2.2d, v17.2d, v19.2d, #90
+; CHECK-NEXT:    fcmla v3.2d, v16.2d, v18.2d, #90
 ; CHECK-NEXT:    b.ne .LBB2_1
 ; CHECK-NEXT:  // %bb.2: // %middle.block
 ; CHECK-NEXT:    zip2 v4.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    zip1 v2.2d, v2.2d, v3.2d
-; CHECK-NEXT:    zip1 v3.2d, v1.2d, v0.2d
-; CHECK-NEXT:    zip2 v0.2d, v1.2d, v0.2d
-; CHECK-NEXT:    fadd v1.2d, v2.2d, v3.2d
-; CHECK-NEXT:    fadd v2.2d, v4.2d, v0.2d
-; CHECK-NEXT:    faddp d0, v1.2d
-; CHECK-NEXT:    faddp d1, v2.2d
+; CHECK-NEXT:    zip2 v3.2d, v1.2d, v0.2d
+; CHECK-NEXT:    zip1 v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    fadd v1.2d, v4.2d, v3.2d
+; CHECK-NEXT:    fadd v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    faddp d1, v1.2d
+; CHECK-NEXT:    faddp d0, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %scevgep = getelementptr i8, ptr %a, i64 32

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
index db290aee1b3b99..0cbe2f46088e6c 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll
@@ -8,23 +8,23 @@ target triple = "aarch64-arm-none-eabi"
 define <vscale x 4 x double> @complex_mul_const(<vscale x 4 x double> %a, <vscale x 4 x double> %b) {
 ; CHECK-LABEL: complex_mul_const:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z4.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z4.d, #0 // =0x0
+; CHECK-NEXT:    fmov z7.d, #3.00000000
+; CHECK-NEXT:    fmov z24.d, #11.00000000
 ; CHECK-NEXT:    mov z6.d, z4.d
-; CHECK-NEXT:    fcmla z5.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z1.d, z3.d, #0
-; CHECK-NEXT:    fcmla z5.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT:    fcmla z5.d, p0/m, z0.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z6.d, p0/m, z1.d, z3.d, #90
-; CHECK-NEXT:    fmov z1.d, #3.00000000
-; CHECK-NEXT:    fmov z2.d, #11.00000000
-; CHECK-NEXT:    zip2 z3.d, z2.d, z1.d
+; CHECK-NEXT:    zip2 z1.d, z24.d, z7.d
+; CHECK-NEXT:    fcmla z5.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT:    zip1 z2.d, z24.d, z7.d
 ; CHECK-NEXT:    mov z0.d, z4.d
-; CHECK-NEXT:    zip1 z1.d, z2.d, z1.d
-; CHECK-NEXT:    fcmla z4.d, p0/m, z6.d, z3.d, #0
-; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z1.d, #0
-; CHECK-NEXT:    fcmla z4.d, p0/m, z6.d, z3.d, #90
-; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z1.d, #90
+; CHECK-NEXT:    fcmla z4.d, p0/m, z6.d, z1.d, #0
+; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z2.d, #0
+; CHECK-NEXT:    fcmla z4.d, p0/m, z6.d, z1.d, #90
+; CHECK-NEXT:    fcmla z0.d, p0/m, z5.d, z2.d, #90
 ; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    ret
 entry:
@@ -55,24 +55,24 @@ entry:
 define <vscale x 4 x double> @complex_mul_non_const(<vscale x 4 x double> %a, <vscale x 4 x double> %b, [2 x double] %c) {
 ; CHECK-LABEL: complex_mul_non_const:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z6.d, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $d5 killed $d5 def $z5
 ; CHECK-NEXT:    // kill: def $d4 killed $d4 def $z4
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z7.d, z6.d
-; CHECK-NEXT:    mov z24.d, z6.d
 ; CHECK-NEXT:    mov z5.d, d5
 ; CHECK-NEXT:    mov z4.d, d4
-; CHECK-NEXT:    fcmla z7.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT:    mov z24.d, z6.d
+; CHECK-NEXT:    mov z7.d, z6.d
+; CHECK-NEXT:    zip2 z25.d, z4.d, z5.d
+; CHECK-NEXT:    zip1 z4.d, z4.d, z5.d
 ; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #0
-; CHECK-NEXT:    fcmla z7.d, p0/m, z0.d, z2.d, #90
-; CHECK-NEXT:    zip2 z2.d, z4.d, z5.d
+; CHECK-NEXT:    fcmla z7.d, p0/m, z0.d, z2.d, #0
 ; CHECK-NEXT:    fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT:    fcmla z7.d, p0/m, z0.d, z2.d, #90
 ; CHECK-NEXT:    mov z0.d, z6.d
-; CHECK-NEXT:    zip1 z4.d, z4.d, z5.d
-; CHECK-NEXT:    fcmla z6.d, p0/m, z24.d, z2.d, #0
+; CHECK-NEXT:    fcmla z6.d, p0/m, z24.d, z25.d, #0
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z7.d, z4.d, #0
-; CHECK-NEXT:    fcmla z6.d, p0/m, z24.d, z2.d, #90
+; CHECK-NEXT:    fcmla z6.d, p0/m, z24.d, z25.d, #90
 ; CHECK-NEXT:    fcmla z0.d, p0/m, z7.d, z4.d, #90
 ; CHECK-NEXT:    mov z1.d, z6.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll
index d27436b6be66a6..0dbc2ecc8b008d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat.ll
@@ -10,19 +10,19 @@ define <4 x double> @complex_mul_const(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: complex_mul_const:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    movi v4.2d, #0000000000000000
 ; CHECK-NEXT:    fcmla v6.2d, v3.2d, v1.2d, #0
 ; CHECK-NEXT:    fcmla v5.2d, v2.2d, v0.2d, #0
 ; CHECK-NEXT:    fcmla v6.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT:    fcmla v5.2d, v2.2d, v0.2d, #90
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v4.2d, v2.2d, v6.2d, #0
-; CHECK-NEXT:    fcmla v0.2d, v2.2d, v5.2d, #0
-; CHECK-NEXT:    fcmla v4.2d, v2.2d, v6.2d, #90
-; CHECK-NEXT:    fcmla v0.2d, v2.2d, v5.2d, #90
+; CHECK-NEXT:    fcmla v4.2d, v1.2d, v6.2d, #0
+; CHECK-NEXT:    fcmla v0.2d, v1.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v4.2d, v1.2d, v6.2d, #90
+; CHECK-NEXT:    fcmla v0.2d, v1.2d, v5.2d, #90
 ; CHECK-NEXT:    mov v1.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -52,22 +52,22 @@ entry:
 define <4 x double> @complex_mul_non_const(<4 x double> %a, <4 x double> %b, [2 x double] %c) {
 ; CHECK-LABEL: complex_mul_non_const:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
 ; CHECK-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-NEXT:    // kill: def $d4 killed $d4 def $q4
 ; CHECK-NEXT:    // kill: def $d5 killed $d5 def $q5
-; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    // kill: def $d4 killed $d4 def $q4
 ; CHECK-NEXT:    mov v4.d[1], v5.d[0]
-; CHECK-NEXT:    fcmla v6.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
 ; CHECK-NEXT:    fcmla v7.2d, v3.2d, v1.2d, #0
-; CHECK-NEXT:    fcmla v6.2d, v2.2d, v0.2d, #90
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v6.2d, v2.2d, v0.2d, #0
 ; CHECK-NEXT:    fcmla v7.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    fcmla v6.2d, v2.2d, v0.2d, #90
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v2.2d, v4.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v5.2d, v4.2d, v7.2d, #0
 ; CHECK-NEXT:    fcmla v0.2d, v4.2d, v6.2d, #0
-; CHECK-NEXT:    fcmla v2.2d, v4.2d, v7.2d, #90
+; CHECK-NEXT:    fcmla v5.2d, v4.2d, v7.2d, #90
 ; CHECK-NEXT:    fcmla v0.2d, v4.2d, v6.2d, #90
-; CHECK-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEXT:    mov v1.16b, v5.16b
 ; CHECK-NEXT:    ret
 entry:
   %c.coerce.fca.1.extract = extractvalue [2 x double] %c, 1

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
index 94f104a44ce261..81a8631a1691b5 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
@@ -35,14 +35,14 @@ define <4 x float> @simple_mul_no_contract(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT:    zip1 v4.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    zip2 v0.2s, v0.2s, v2.2s
-; CHECK-NEXT:    zip2 v5.2s, v1.2s, v3.2s
-; CHECK-NEXT:    zip1 v1.2s, v1.2s, v3.2s
-; CHECK-NEXT:    fmul v2.2s, v5.2s, v4.2s
+; CHECK-NEXT:    zip1 v2.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
 ; CHECK-NEXT:    fmul v3.2s, v1.2s, v4.2s
-; CHECK-NEXT:    fmul v4.2s, v0.2s, v5.2s
-; CHECK-NEXT:    fmla v2.2s, v0.2s, v1.2s
-; CHECK-NEXT:    fsub v0.2s, v3.2s, v4.2s
-; CHECK-NEXT:    zip1 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    fmul v4.2s, v2.2s, v4.2s
+; CHECK-NEXT:    fmul v1.2s, v0.2s, v1.2s
+; CHECK-NEXT:    fmla v3.2s, v0.2s, v2.2s
+; CHECK-NEXT:    fsub v0.2s, v4.2s, v1.2s
+; CHECK-NEXT:    zip1 v0.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    ret
 entry:
   %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
@@ -155,8 +155,8 @@ define <4 x float> @add_external_use(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    zip2 v0.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    zip1 v2.2s, v1.2s, v3.2s
 ; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
-; CHECK-NEXT:    fsub v1.2s, v4.2s, v1.2s
 ; CHECK-NEXT:    fadd v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    fsub v1.2s, v4.2s, v1.2s
 ; CHECK-NEXT:    zip1 v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    ret
 entry:
@@ -201,93 +201,93 @@ entry:
 define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) {
 ; CHECK-LABEL: abp90c12:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr s21, [sp, #32]
-; CHECK-NEXT:    add x9, sp, #48
-; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    ldr s23, [sp, #40]
-; CHECK-NEXT:    add x11, sp, #56
-; CHECK-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-NEXT:    ldr s2, [sp]
-; CHECK-NEXT:    add x10, sp, #16
-; CHECK-NEXT:    ld1 { v21.s }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #64
-; CHECK-NEXT:    ld1 { v23.s }[1], [x11]
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-NEXT:    ldr s22, [sp, #96]
-; CHECK-NEXT:    add x11, sp, #24
-; CHECK-NEXT:    ld1 { v2.s }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #72
+; CHECK-NEXT:    ldr s16, [sp, #40]
+; CHECK-NEXT:    add x10, sp, #56
+; CHECK-NEXT:    add x9, sp, #48
 ; CHECK-NEXT:    mov v1.s[1], v3.s[0]
-; CHECK-NEXT:    ld1 { v21.s }[2], [x9]
-; CHECK-NEXT:    ldr s24, [sp, #8]
-; CHECK-NEXT:    add x9, sp, #112
-; CHECK-NEXT:    ld1 { v23.s }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #80
+; CHECK-NEXT:    ldr s3, [sp, #32]
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-NEXT:    ldr s18, [sp, #128]
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT:    ldr s18, [sp, #8]
+; CHECK-NEXT:    ld1 { v16.s }[1], [x10]
+; CHECK-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-NEXT:    add x10, sp, #72
+; CHECK-NEXT:    ld1 { v3.s }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #64
+; CHECK-NEXT:    ldr s17, [sp, #96]
 ; CHECK-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-NEXT:    // kill: def $s6 killed $s6 def $q6
+; CHECK-NEXT:    ldr s2, [sp, #136]
+; CHECK-NEXT:    ldr s20, [sp, #192]
 ; CHECK-NEXT:    mov v1.s[2], v5.s[0]
-; CHECK-NEXT:    ldr s20, [sp, #104]
-; CHECK-NEXT:    ld1 { v24.s }[1], [x11]
-; CHECK-NEXT:    add x11, sp, #88
-; CHECK-NEXT:    ld1 { v22.s }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #144
-; CHECK-NEXT:    ld1 { v21.s }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #120
-; CHECK-NEXT:    mov v0.s[2], v4.s[0]
-; CHECK-NEXT:    ld1 { v23.s }[3], [x11]
+; CHECK-NEXT:    ld1 { v16.s }[2], [x10]
+; CHECK-NEXT:    ldr s5, [sp, #104]
+; CHECK-NEXT:    ld1 { v3.s }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #24
+; CHECK-NEXT:    add x10, sp, #112
 ; CHECK-NEXT:    ld1 { v18.s }[1], [x9]
-; CHECK-NEXT:    add x11, sp, #152
-; CHECK-NEXT:    ld1 { v20.s }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #160
+; CHECK-NEXT:    add x9, sp, #88
+; CHECK-NEXT:    mov v0.s[2], v4.s[0]
+; CHECK-NEXT:    ld1 { v17.s }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #80
+; CHECK-NEXT:    ld1 { v16.s }[3], [x9]
 ; CHECK-NEXT:    mov v1.s[3], v7.s[0]
-; CHECK-NEXT:    ldr s17, [sp, #136]
-; CHECK-NEXT:    ldr s19, [sp, #192]
-; CHECK-NEXT:    add x9, sp, #208
+; CHECK-NEXT:    add x9, sp, #120
+; CHECK-NEXT:    ldr s4, [sp, #128]
+; CHECK-NEXT:    ld1 { v3.s }[3], [x10]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #144
+; CHECK-NEXT:    ldr s7, [sp]
+; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
 ; CHECK-NEXT:    mov v0.s[3], v6.s[0]
-; CHECK-NEXT:    ld1 { v18.s }[2], [x10]
-; CHECK-NEXT:    ld1 { v17.s }[1], [x11]
-; CHECK-NEXT:    add x10, sp, #176
-; CHECK-NEXT:    fmul v3.4s, v23.4s, v1.4s
-; CHECK-NEXT:    ld1 { v19.s }[1], [x9]
-; CHECK-NEXT:    fmul v4.4s, v20.4s, v24.4s
+; CHECK-NEXT:    add x10, sp, #16
+; CHECK-NEXT:    add x9, sp, #160
+; CHECK-NEXT:    fmul v6.4s, v16.4s, v1.4s
+; CHECK-NEXT:    fmul v19.4s, v5.4s, v18.4s
+; CHECK-NEXT:    fmul v18.4s, v17.4s, v18.4s
+; CHECK-NEXT:    fmul v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v4.s }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #152
+; CHECK-NEXT:    add x10, sp, #208
+; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #176
+; CHECK-NEXT:    ld1 { v20.s }[1], [x10]
+; CHECK-NEXT:    fneg v6.4s, v6.4s
+; CHECK-NEXT:    fneg v19.4s, v19.4s
+; CHECK-NEXT:    fmla v18.4s, v7.4s, v5.4s
+; CHECK-NEXT:    fmla v1.4s, v0.4s, v16.4s
+; CHECK-NEXT:    ld1 { v4.s }[3], [x9]
 ; CHECK-NEXT:    add x9, sp, #168
-; CHECK-NEXT:    fmul v1.4s, v21.4s, v1.4s
-; CHECK-NEXT:    ld1 { v18.s }[3], [x10]
-; CHECK-NEXT:    fmul v5.4s, v22.4s, v24.4s
-; CHECK-NEXT:    ldr s16, [sp, #200]
-; CHECK-NEXT:    ld1 { v17.s }[2], [x9]
-; CHECK-NEXT:    add x11, sp, #216
-; CHECK-NEXT:    fneg v3.4s, v3.4s
-; CHECK-NEXT:    add x9, sp, #184
-; CHECK-NEXT:    fneg v4.4s, v4.4s
-; CHECK-NEXT:    fmla v1.4s, v0.4s, v23.4s
-; CHECK-NEXT:    fmla v5.4s, v2.4s, v20.4s
-; CHECK-NEXT:    ld1 { v16.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v17.s }[3], [x9]
-; CHECK-NEXT:    fmla v3.4s, v0.4s, v21.4s
-; CHECK-NEXT:    fmla v4.4s, v2.4s, v22.4s
-; CHECK-NEXT:    fsub v0.4s, v18.4s, v1.4s
-; CHECK-NEXT:    fsub v1.4s, v19.4s, v5.4s
-; CHECK-NEXT:    fadd v2.4s, v17.4s, v3.4s
-; CHECK-NEXT:    fadd v3.4s, v16.4s, v4.4s
+; CHECK-NEXT:    ld1 { v2.s }[2], [x9]
+; CHECK-NEXT:    ldr s5, [sp, #200]
+; CHECK-NEXT:    add x9, sp, #216
+; CHECK-NEXT:    add x10, sp, #184
+; CHECK-NEXT:    fmla v6.4s, v0.4s, v3.4s
+; CHECK-NEXT:    fmla v19.4s, v7.4s, v17.4s
+; CHECK-NEXT:    ld1 { v5.s }[1], [x9]
+; CHECK-NEXT:    fsub v0.4s, v4.4s, v1.4s
+; CHECK-NEXT:    fsub v1.4s, v20.4s, v18.4s
+; CHECK-NEXT:    ld1 { v2.s }[3], [x10]
+; CHECK-NEXT:    fadd v3.4s, v5.4s, v19.4s
+; CHECK-NEXT:    fadd v2.4s, v2.4s, v6.4s
 ; CHECK-NEXT:    ext v4.16b, v0.16b, v1.16b, #12
 ; CHECK-NEXT:    ext v5.16b, v2.16b, v3.16b, #12
 ; CHECK-NEXT:    trn2 v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    ext v4.16b, v0.16b, v4.16b, #12
-; CHECK-NEXT:    zip2 v3.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ext v5.16b, v2.16b, v5.16b, #8
-; CHECK-NEXT:    zip1 v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    rev64 v4.4s, v4.4s
+; CHECK-NEXT:    trn2 v3.4s, v4.4s, v5.4s
+; CHECK-NEXT:    zip2 v4.4s, v0.4s, v2.4s
+; CHECK-NEXT:    zip1 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ext v1.16b, v3.16b, v1.16b, #8
+; CHECK-NEXT:    mov v4.d[1], v3.d[0]
 ; CHECK-NEXT:    str q0, [x8]
-; CHECK-NEXT:    trn2 v4.4s, v4.4s, v5.4s
-; CHECK-NEXT:    ext v1.16b, v4.16b, v1.16b, #8
-; CHECK-NEXT:    mov v3.d[1], v4.d[0]
-; CHECK-NEXT:    stp q3, q1, [x8, #16]
+; CHECK-NEXT:    stp q4, q1, [x8, #16]
 ; CHECK-NEXT:    ret
 entry:
   %ar = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>

diff --git a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
index 5c542c4ca79a2f..e7c85281459fa3 100644
--- a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
+++ b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
@@ -96,8 +96,8 @@ entry:
 define void @test_concat_fptrunc_v4f64_to_v4f32(<vscale x 4 x float>* %ptr) #1 {
 ; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov z0.s, #1.00000000
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmov z0.s, #1.00000000
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/cond-br-tuning.ll b/llvm/test/CodeGen/AArch64/cond-br-tuning.ll
index 458a376b50203e..dc00c41892ba89 100644
--- a/llvm/test/CodeGen/AArch64/cond-br-tuning.ll
+++ b/llvm/test/CodeGen/AArch64/cond-br-tuning.ll
@@ -27,9 +27,9 @@ L2:
 define void @test_add_cbz_multiple_use(i32 %a, i32 %b, ptr %ptr) {
 ; CHECK-LABEL: test_add_cbz_multiple_use:
 ; CHECK:       // %bb.0: // %common.ret
-; CHECK-NEXT:    mov w8, #10
-; CHECK-NEXT:    adds w9, w0, w1
-; CHECK-NEXT:    csel w8, w8, w9, ne
+; CHECK-NEXT:    adds w8, w0, w1
+; CHECK-NEXT:    mov w9, #10 // =0xa
+; CHECK-NEXT:    csel w8, w9, w8, ne
 ; CHECK-NEXT:    str w8, [x2]
 ; CHECK-NEXT:    ret
   %c = add nsw i32 %a, %b

diff --git a/llvm/test/CodeGen/AArch64/consthoist-gep.ll b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
index 9245b670489de7..d109dade67f551 100644
--- a/llvm/test/CodeGen/AArch64/consthoist-gep.ll
+++ b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
@@ -32,44 +32,44 @@ define dso_local void @blam() local_unnamed_addr #0 {
 ; CHECK-NEXT:    ldrb w9, [x8]
 ; CHECK-NEXT:    tbnz w9, #0, .LBB0_2
 ; CHECK-NEXT:  // %bb.1: // %bb3
-; CHECK-NEXT:    mov w9, #44032
+; CHECK-NEXT:    mov w9, #44032 // =0xac00
+; CHECK-NEXT:    mov w11, #172 // =0xac
 ; CHECK-NEXT:    movk w9, #12296, lsl #16
-; CHECK-NEXT:    orr w11, w9, #0x4
 ; CHECK-NEXT:    ldr w10, [x9]
 ; CHECK-NEXT:    stur w10, [x8, #158]
-; CHECK-NEXT:    ldr w10, [x11]
-; CHECK-NEXT:    orr w11, w9, #0x8
+; CHECK-NEXT:    orr w10, w9, #0x4
+; CHECK-NEXT:    ldr w10, [x10]
 ; CHECK-NEXT:    and w10, w10, #0xffff
 ; CHECK-NEXT:    stur w10, [x8, #162]
-; CHECK-NEXT:    ldr w10, [x11]
-; CHECK-NEXT:    orr w11, w9, #0xc
+; CHECK-NEXT:    orr w10, w9, #0x8
+; CHECK-NEXT:    ldr w10, [x10]
 ; CHECK-NEXT:    and w10, w10, #0x1f1f1f1f
 ; CHECK-NEXT:    stur w10, [x8, #166]
-; CHECK-NEXT:    ldr w10, [x11]
-; CHECK-NEXT:    mov w11, #172
-; CHECK-NEXT:    orr w11, w9, w11
+; CHECK-NEXT:    orr w10, w9, #0xc
+; CHECK-NEXT:    ldr w10, [x10]
 ; CHECK-NEXT:    and w10, w10, #0x1f1f1f1f
 ; CHECK-NEXT:    stur w10, [x8, #170]
-; CHECK-NEXT:    mov w10, #176
-; CHECK-NEXT:    ldr w8, [x11]
-; CHECK-NEXT:    adrp x11, global+528
-; CHECK-NEXT:    add x11, x11, :lo12:global+528
-; CHECK-NEXT:    orr w10, w9, w10
+; CHECK-NEXT:    orr w8, w9, w11
+; CHECK-NEXT:    adrp x10, global+528
+; CHECK-NEXT:    add x10, x10, :lo12:global+528
+; CHECK-NEXT:    ldr w8, [x8]
+; CHECK-NEXT:    mov w11, #176 // =0xb0
 ; CHECK-NEXT:    and w8, w8, #0xffffff
-; CHECK-NEXT:    str w8, [x11]
-; CHECK-NEXT:    ldr w8, [x10]
-; CHECK-NEXT:    mov w10, #180
-; CHECK-NEXT:    orr w10, w9, w10
+; CHECK-NEXT:    str w8, [x10]
+; CHECK-NEXT:    orr w8, w9, w11
+; CHECK-NEXT:    mov w11, #180 // =0xb4
+; CHECK-NEXT:    ldr w8, [x8]
 ; CHECK-NEXT:    and w8, w8, #0xffffff
-; CHECK-NEXT:    str w8, [x11, #4]
-; CHECK-NEXT:    ldr w8, [x10]
-; CHECK-NEXT:    mov w10, #184
-; CHECK-NEXT:    orr w9, w9, w10
+; CHECK-NEXT:    str w8, [x10, #4]
+; CHECK-NEXT:    orr w8, w9, w11
+; CHECK-NEXT:    mov w11, #184 // =0xb8
+; CHECK-NEXT:    ldr w8, [x8]
 ; CHECK-NEXT:    and w8, w8, #0xffffff
-; CHECK-NEXT:    str w8, [x11, #8]
-; CHECK-NEXT:    ldr w8, [x9]
+; CHECK-NEXT:    str w8, [x10, #8]
+; CHECK-NEXT:    orr w8, w9, w11
+; CHECK-NEXT:    ldr w8, [x8]
 ; CHECK-NEXT:    and w8, w8, #0xffffff
-; CHECK-NEXT:    str w8, [x11, #12]
+; CHECK-NEXT:    str w8, [x10, #12]
 ; CHECK-NEXT:  .LBB0_2: // %bb19
 ; CHECK-NEXT:    ret
 bb:

diff --git a/llvm/test/CodeGen/AArch64/copyprop.ll b/llvm/test/CodeGen/AArch64/copyprop.ll
index 6fcb8d435a4e38..965aa2cb98c758 100644
--- a/llvm/test/CodeGen/AArch64/copyprop.ll
+++ b/llvm/test/CodeGen/AArch64/copyprop.ll
@@ -7,19 +7,19 @@ define void @copyprop_after_mbp(i32 %v, ptr %a, ptr %b, ptr %c, ptr %d) {
 ; CHECK-NEXT:    cmp w0, #10
 ; CHECK-NEXT:    b.ne .LBB0_2
 ; CHECK-NEXT:  // %bb.1: // %bb.0
-; CHECK-NEXT:    mov w9, #15
-; CHECK-NEXT:    mov w8, #1
-; CHECK-NEXT:    str w9, [x2]
-; CHECK-NEXT:    mov w9, #12
+; CHECK-NEXT:    mov w8, #15 // =0xf
+; CHECK-NEXT:    str w8, [x2]
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    str w8, [x1]
-; CHECK-NEXT:    str w9, [x4]
+; CHECK-NEXT:    mov w8, #12 // =0xc
+; CHECK-NEXT:    str w8, [x4]
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_2: // %bb.1
-; CHECK-NEXT:    mov w9, #25
+; CHECK-NEXT:    mov w9, #25 // =0x19
 ; CHECK-NEXT:    str w9, [x3]
-; CHECK-NEXT:    mov w9, #12
 ; CHECK-NEXT:    str wzr, [x1]
-; CHECK-NEXT:    str w9, [x4]
+; CHECK-NEXT:    mov w8, #12 // =0xc
+; CHECK-NEXT:    str w8, [x4]
 ; CHECK-NEXT:    ret
   %1 = icmp eq i32 %v, 10
   br i1 %1, label %bb.0, label %bb.1

diff --git a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
index af5652a442ace0..1d3e7a392a28ff 100644
--- a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
@@ -7,28 +7,28 @@ declare i128 @llvm.ctpop.i128(i128)
 define i128 @ctpop_i128(i128 %i) {
 ; CHECK-LABEL: ctpop_i128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x1, #1
 ; CHECK-NEXT:    lsr x9, x0, #1
-; CHECK-NEXT:    and x8, x8, #0x5555555555555555
+; CHECK-NEXT:    lsr x10, x1, #1
+; CHECK-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
 ; CHECK-NEXT:    and x9, x9, #0x5555555555555555
-; CHECK-NEXT:    sub x8, x1, x8
+; CHECK-NEXT:    and x10, x10, #0x5555555555555555
 ; CHECK-NEXT:    sub x9, x0, x9
-; CHECK-NEXT:    and x10, x8, #0x3333333333333333
-; CHECK-NEXT:    lsr x8, x8, #2
-; CHECK-NEXT:    and x11, x9, #0x3333333333333333
-; CHECK-NEXT:    lsr x9, x9, #2
-; CHECK-NEXT:    and x8, x8, #0x3333333333333333
-; CHECK-NEXT:    and x9, x9, #0x3333333333333333
-; CHECK-NEXT:    add x8, x10, x8
-; CHECK-NEXT:    add x9, x11, x9
-; CHECK-NEXT:    mov x10, #72340172838076673
+; CHECK-NEXT:    sub x10, x1, x10
 ; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    add x8, x8, x8, lsr #4
+; CHECK-NEXT:    lsr x11, x9, #2
+; CHECK-NEXT:    lsr x12, x10, #2
+; CHECK-NEXT:    and x9, x9, #0x3333333333333333
+; CHECK-NEXT:    and x10, x10, #0x3333333333333333
+; CHECK-NEXT:    and x11, x11, #0x3333333333333333
+; CHECK-NEXT:    add x9, x9, x11
+; CHECK-NEXT:    and x11, x12, #0x3333333333333333
 ; CHECK-NEXT:    add x9, x9, x9, lsr #4
-; CHECK-NEXT:    and x8, x8, #0xf0f0f0f0f0f0f0f
+; CHECK-NEXT:    add x10, x10, x11
+; CHECK-NEXT:    add x10, x10, x10, lsr #4
 ; CHECK-NEXT:    and x9, x9, #0xf0f0f0f0f0f0f0f
-; CHECK-NEXT:    mul x8, x8, x10
-; CHECK-NEXT:    mul x9, x9, x10
+; CHECK-NEXT:    mul x9, x9, x8
+; CHECK-NEXT:    and x10, x10, #0xf0f0f0f0f0f0f0f
+; CHECK-NEXT:    mul x8, x10, x8
 ; CHECK-NEXT:    lsr x9, x9, #56
 ; CHECK-NEXT:    add x0, x9, x8, lsr #56
 ; CHECK-NEXT:    ret
@@ -37,8 +37,8 @@ define i128 @ctpop_i128(i128 %i) {
 ; CHECK-CSSC:       // %bb.0:
 ; CHECK-CSSC-NEXT:    cnt x8, x1
 ; CHECK-CSSC-NEXT:    cnt x9, x0
-; CHECK-CSSC-NEXT:    add x0, x9, x8
 ; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    add x0, x9, x8
 ; CHECK-CSSC-NEXT:    ret
   %c = call i128 @llvm.ctpop.i128(i128 %i)
   ret i128 %c

diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
index 64e6372fd859b6..4e50cb11be71f9 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll
@@ -8,24 +8,24 @@ declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x
 define fastcc i8 @allocno_reload_assign() {
 ; CHECK-LABEL: allocno_reload_assign:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z3.b, #0 // =0x0
+; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z4.h, z3.b
-; CHECK-NEXT:    uunpkhi z7.h, z3.b
-; CHECK-NEXT:    uunpklo z2.s, z4.h
-; CHECK-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NEXT:    uunpklo z6.s, z7.h
-; CHECK-NEXT:    uunpkhi z16.s, z7.h
+; CHECK-NEXT:    mov z16.d, #0 // =0x0
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    mov z0.d, #0 // =0x0
-; CHECK-NEXT:    uunpklo z1.d, z2.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpklo z3.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z5.d, z6.s
-; CHECK-NEXT:    uunpkhi z6.d, z6.s
-; CHECK-NEXT:    uunpklo z7.d, z16.s
-; CHECK-NEXT:    uunpkhi z16.d, z16.s
+; CHECK-NEXT:    uunpklo z1.h, z0.b
+; CHECK-NEXT:    uunpkhi z0.h, z0.b
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpkhi z3.s, z1.h
+; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    uunpkhi z7.s, z0.h
+; CHECK-NEXT:    uunpklo z0.d, z2.s
+; CHECK-NEXT:    uunpkhi z1.d, z2.s
+; CHECK-NEXT:    uunpklo z2.d, z3.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    uunpklo z4.d, z5.s
+; CHECK-NEXT:    uunpkhi z5.d, z5.s
+; CHECK-NEXT:    uunpklo z6.d, z7.s
+; CHECK-NEXT:    uunpkhi z7.d, z7.s
 ; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    fmov d17, xzr
 ; CHECK-NEXT:    cmpeq p2.d, p0/z, z17.d, #0
@@ -43,22 +43,22 @@ define fastcc i8 @allocno_reload_assign() {
 ; CHECK-NEXT:    punpkhi p3.h, p3.b
 ; CHECK-NEXT:    punpklo p5.h, p4.b
 ; CHECK-NEXT:    punpkhi p4.h, p4.b
-; CHECK-NEXT:    st1b { z1.d }, p5, [z0.d]
-; CHECK-NEXT:    punpklo p5.h, p2.b
-; CHECK-NEXT:    st1b { z2.d }, p4, [z0.d]
+; CHECK-NEXT:    st1b { z0.d }, p5, [z16.d]
+; CHECK-NEXT:    st1b { z1.d }, p4, [z16.d]
 ; CHECK-NEXT:    punpklo p4.h, p3.b
-; CHECK-NEXT:    punpkhi p2.h, p2.b
 ; CHECK-NEXT:    punpkhi p3.h, p3.b
-; CHECK-NEXT:    st1b { z3.d }, p4, [z0.d]
-; CHECK-NEXT:    punpklo p4.h, p5.b
-; CHECK-NEXT:    st1b { z4.d }, p3, [z0.d]
-; CHECK-NEXT:    punpkhi p3.h, p5.b
-; CHECK-NEXT:    st1b { z5.d }, p4, [z0.d]
+; CHECK-NEXT:    st1b { z2.d }, p4, [z16.d]
 ; CHECK-NEXT:    punpklo p4.h, p2.b
 ; CHECK-NEXT:    punpkhi p2.h, p2.b
-; CHECK-NEXT:    st1b { z6.d }, p3, [z0.d]
-; CHECK-NEXT:    st1b { z7.d }, p4, [z0.d]
-; CHECK-NEXT:    st1b { z16.d }, p2, [z0.d]
+; CHECK-NEXT:    st1b { z3.d }, p3, [z16.d]
+; CHECK-NEXT:    punpklo p3.h, p4.b
+; CHECK-NEXT:    st1b { z4.d }, p3, [z16.d]
+; CHECK-NEXT:    punpkhi p3.h, p4.b
+; CHECK-NEXT:    st1b { z5.d }, p3, [z16.d]
+; CHECK-NEXT:    punpklo p3.h, p2.b
+; CHECK-NEXT:    punpkhi p2.h, p2.b
+; CHECK-NEXT:    st1b { z6.d }, p3, [z16.d]
+; CHECK-NEXT:    st1b { z7.d }, p2, [z16.d]
 ; CHECK-NEXT:    b .LBB0_1
   br label %1
 

diff --git a/llvm/test/CodeGen/AArch64/dag-combine-select.ll b/llvm/test/CodeGen/AArch64/dag-combine-select.ll
index 92ae9410b965fd..56208f19782cec 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-select.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-select.ll
@@ -35,14 +35,14 @@ define void @test1(i32 %bitset, i32 %val0, i32 %val1) {
 ; SDISEL-LABEL: test1:
 ; SDISEL:       // %bb.0:
 ; SDISEL-NEXT:    cmp w0, #7
-; SDISEL-NEXT:    adrp x8, out
-; SDISEL-NEXT:    csel w9, w1, w2, eq
-; SDISEL-NEXT:    cmp w9, #13
-; SDISEL-NEXT:    csel w9, w1, w2, lo
+; SDISEL-NEXT:    adrp x9, out
+; SDISEL-NEXT:    csel w8, w1, w2, eq
+; SDISEL-NEXT:    cmp w8, #13
+; SDISEL-NEXT:    csel w8, w1, w2, lo
 ; SDISEL-NEXT:    cmp w0, #42
-; SDISEL-NEXT:    csel w10, w1, w9, eq
-; SDISEL-NEXT:    str w9, [x8, :lo12:out]
-; SDISEL-NEXT:    str w10, [x8, :lo12:out]
+; SDISEL-NEXT:    csel w10, w1, w8, eq
+; SDISEL-NEXT:    str w8, [x9, :lo12:out]
+; SDISEL-NEXT:    str w10, [x9, :lo12:out]
 ; SDISEL-NEXT:    ret
 ;
 ; GISEL-LABEL: test1:

diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index cba2e7ac657b06..a48a4e0e723ebc 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -35,8 +35,8 @@ define i1 @combine_setcc_eq_vecreduce_or_v32i1(<32 x i8> %a) {
 ; CHECK-LABEL: combine_setcc_eq_vecreduce_or_v32i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmeq v1.16b, v1.16b, #0
-; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    cmeq v0.16b, v0.16b, #0
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    umaxv b0, v0.16b
 ; CHECK-NEXT:    fmov w9, s0
@@ -52,16 +52,16 @@ define i1 @combine_setcc_eq_vecreduce_or_v64i1(<64 x i8> %a) {
 ; CHECK-LABEL: combine_setcc_eq_vecreduce_or_v64i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmeq v2.16b, v2.16b, #0
-; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    cmeq v0.16b, v0.16b, #0
+; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:    cmeq v3.16b, v3.16b, #0
 ; CHECK-NEXT:    cmeq v1.16b, v1.16b, #0
-; CHECK-NEXT:    cmeq v0.16b, v0.16b, #0
-; CHECK-NEXT:    orr v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    orr v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    umaxv b0, v0.16b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    bic w0, w8, w9
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    bic w0, w9, w8
 ; CHECK-NEXT:    ret
   %cmp1 = icmp eq <64 x i8> %a, zeroinitializer
   %cast = bitcast <64 x i1> %cmp1 to i64
@@ -223,8 +223,8 @@ define i1 @combine_setcc_ne_vecreduce_and_v32i1(<32 x i8> %a) {
 ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v32i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmtst v0.16b, v0.16b, v0.16b
-; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    cmeq v1.16b, v1.16b, #0
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    bic v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    uminv b0, v0.16b
 ; CHECK-NEXT:    fmov w9, s0
@@ -240,16 +240,16 @@ define i1 @combine_setcc_ne_vecreduce_and_v64i1(<64 x i8> %a) {
 ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v64i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmtst v1.16b, v1.16b, v1.16b
-; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    cmtst v0.16b, v0.16b, v0.16b
-; CHECK-NEXT:    cmeq v3.16b, v3.16b, #0
+; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:    cmeq v2.16b, v2.16b, #0
+; CHECK-NEXT:    cmeq v3.16b, v3.16b, #0
 ; CHECK-NEXT:    bic v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    bic v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    uminv b0, v0.16b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    bic w0, w8, w9
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    bic w0, w9, w8
 ; CHECK-NEXT:    ret
   %cmp1 = icmp ne <64 x i8> %a, zeroinitializer
   %cast = bitcast <64 x i1> %cmp1 to i64
@@ -260,10 +260,10 @@ define i1 @combine_setcc_ne_vecreduce_and_v64i1(<64 x i8> %a) {
 define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) {
 ; CHECK-LABEL: combine_setcc_eq0_conjunction_xor_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ldp x8, x11, [x1]
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
@@ -274,10 +274,10 @@ define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) {
 define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
 ; CHECK-LABEL: combine_setcc_ne0_conjunction_xor_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ldp x8, x11, [x1]
+; CHECK-NEXT:    ldp x9, x10, [x0]
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
@@ -289,9 +289,9 @@ define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
 define i32 @combine_setcc_multiuse(i32 %0, i32 %1, i32 %2, i32 %3) {
 ; CHECK-LABEL: combine_setcc_multiuse:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor w8, w1, w0
-; CHECK-NEXT:    eor w9, w3, w2
-; CHECK-NEXT:    orr w8, w9, w8
+; CHECK-NEXT:    eor w8, w3, w2
+; CHECK-NEXT:    eor w9, w1, w0
+; CHECK-NEXT:    orr w8, w8, w9
 ; CHECK-NEXT:    cbz w8, .LBB18_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    mov w0, w8

diff --git a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
index 802df9d116049a..575102ce4ff962 100644
--- a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
+++ b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
@@ -6,13 +6,13 @@
 define void @signbits_vXi1(<4 x i16> %a1) {
 ; CHECK-LABEL: signbits_vXi1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    movi v2.4h, #1
-; CHECK-NEXT:    dup v0.4h, v0.h[0]
 ; CHECK-NEXT:    mov w1, wzr
-; CHECK-NEXT:    mov w2, wzr
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    mov w2, wzr
 ; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    cmgt v0.4h, v2.4h, v0.4h
 ; CHECK-NEXT:    umov w0, v0.h[0]

diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
index 49eeb476c601fe..49ad3ae7d62907 100644
--- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
@@ -78,16 +78,17 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; ALL-NEXT:    smov w13, v0.b[4]
 ; ALL-NEXT:    smov w14, v0.b[5]
 ; ALL-NEXT:    smov w15, v0.b[6]
-; ALL-NEXT:    sdiv w8, w9, w8
-; ALL-NEXT:    smov w9, v1.b[0]
 ; ALL-NEXT:    smov w16, v0.b[7]
 ; ALL-NEXT:    smov w17, v0.b[8]
+; ALL-NEXT:    smov w18, v0.b[9]
+; ALL-NEXT:    sdiv w8, w9, w8
+; ALL-NEXT:    smov w9, v1.b[0]
 ; ALL-NEXT:    sdiv w9, w10, w9
 ; ALL-NEXT:    smov w10, v1.b[2]
 ; ALL-NEXT:    sdiv w10, w11, w10
 ; ALL-NEXT:    smov w11, v1.b[3]
 ; ALL-NEXT:    fmov s2, w9
-; ALL-NEXT:    smov w9, v1.b[9]
+; ALL-NEXT:    smov w9, v1.b[10]
 ; ALL-NEXT:    mov v2.b[1], w8
 ; ALL-NEXT:    sdiv w11, w12, w11
 ; ALL-NEXT:    smov w12, v1.b[4]
@@ -109,10 +110,9 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; ALL-NEXT:    smov w16, v1.b[8]
 ; ALL-NEXT:    mov v2.b[6], w14
 ; ALL-NEXT:    sdiv w16, w17, w16
-; ALL-NEXT:    smov w17, v0.b[9]
+; ALL-NEXT:    smov w17, v1.b[9]
 ; ALL-NEXT:    mov v2.b[7], w15
-; ALL-NEXT:    sdiv w8, w17, w9
-; ALL-NEXT:    smov w9, v1.b[10]
+; ALL-NEXT:    sdiv w8, w18, w17
 ; ALL-NEXT:    mov v2.b[8], w16
 ; ALL-NEXT:    sdiv w9, w10, w9
 ; ALL-NEXT:    smov w10, v1.b[11]
@@ -153,6 +153,7 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw
 ; ALL-NEXT:    smov w11, v0.h[2]
 ; ALL-NEXT:    smov w12, v0.h[3]
 ; ALL-NEXT:    smov w13, v0.h[4]
+; ALL-NEXT:    smov w14, v0.h[5]
 ; ALL-NEXT:    sdiv w8, w9, w8
 ; ALL-NEXT:    smov w9, v1.h[0]
 ; ALL-NEXT:    sdiv w9, w10, w9
@@ -160,18 +161,17 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw
 ; ALL-NEXT:    sdiv w10, w11, w10
 ; ALL-NEXT:    smov w11, v1.h[3]
 ; ALL-NEXT:    fmov s2, w9
-; ALL-NEXT:    smov w9, v1.h[5]
+; ALL-NEXT:    smov w9, v1.h[6]
 ; ALL-NEXT:    mov v2.h[1], w8
 ; ALL-NEXT:    sdiv w11, w12, w11
 ; ALL-NEXT:    smov w12, v1.h[4]
 ; ALL-NEXT:    mov v2.h[2], w10
 ; ALL-NEXT:    smov w10, v0.h[6]
 ; ALL-NEXT:    sdiv w12, w13, w12
-; ALL-NEXT:    smov w13, v0.h[5]
+; ALL-NEXT:    smov w13, v1.h[5]
 ; ALL-NEXT:    mov v2.h[3], w11
 ; ALL-NEXT:    smov w11, v0.h[7]
-; ALL-NEXT:    sdiv w8, w13, w9
-; ALL-NEXT:    smov w9, v1.h[6]
+; ALL-NEXT:    sdiv w8, w14, w13
 ; ALL-NEXT:    mov v2.h[4], w12
 ; ALL-NEXT:    sdiv w9, w10, w9
 ; ALL-NEXT:    smov w10, v1.h[7]
@@ -226,15 +226,15 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
 ; ALL-NEXT:    mov x10, v1.d[1]
 ; ALL-NEXT:    mov x11, v0.d[1]
 ; ALL-NEXT:    sdiv x9, x9, x8
-; ALL-NEXT:    mul x8, x9, x8
 ; ALL-NEXT:    sdiv x11, x11, x10
-; ALL-NEXT:    fmov d2, x9
+; ALL-NEXT:    mul x8, x9, x8
 ; ALL-NEXT:    fmov d1, x8
 ; ALL-NEXT:    mul x10, x11, x10
-; ALL-NEXT:    mov v2.d[1], x11
 ; ALL-NEXT:    mov v1.d[1], x10
-; ALL-NEXT:    str q2, [x0]
 ; ALL-NEXT:    sub v0.2d, v0.2d, v1.2d
+; ALL-NEXT:    fmov d1, x9
+; ALL-NEXT:    mov v1.d[1], x11
+; ALL-NEXT:    str q1, [x0]
 ; ALL-NEXT:    ret
   %div = sdiv <2 x i64> %x, %y
   store <2 x i64> %div, ptr %divdst, align 16

diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
index aff2087025e328..3bc50b2f03d83d 100644
--- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
@@ -78,16 +78,17 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; ALL-NEXT:    umov w13, v0.b[4]
 ; ALL-NEXT:    umov w14, v0.b[5]
 ; ALL-NEXT:    umov w15, v0.b[6]
-; ALL-NEXT:    udiv w8, w9, w8
-; ALL-NEXT:    umov w9, v1.b[0]
 ; ALL-NEXT:    umov w16, v0.b[7]
 ; ALL-NEXT:    umov w17, v0.b[8]
+; ALL-NEXT:    umov w18, v0.b[9]
+; ALL-NEXT:    udiv w8, w9, w8
+; ALL-NEXT:    umov w9, v1.b[0]
 ; ALL-NEXT:    udiv w9, w10, w9
 ; ALL-NEXT:    umov w10, v1.b[2]
 ; ALL-NEXT:    udiv w10, w11, w10
 ; ALL-NEXT:    umov w11, v1.b[3]
 ; ALL-NEXT:    fmov s2, w9
-; ALL-NEXT:    umov w9, v1.b[9]
+; ALL-NEXT:    umov w9, v1.b[10]
 ; ALL-NEXT:    mov v2.b[1], w8
 ; ALL-NEXT:    udiv w11, w12, w11
 ; ALL-NEXT:    umov w12, v1.b[4]
@@ -109,10 +110,9 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; ALL-NEXT:    umov w16, v1.b[8]
 ; ALL-NEXT:    mov v2.b[6], w14
 ; ALL-NEXT:    udiv w16, w17, w16
-; ALL-NEXT:    umov w17, v0.b[9]
+; ALL-NEXT:    umov w17, v1.b[9]
 ; ALL-NEXT:    mov v2.b[7], w15
-; ALL-NEXT:    udiv w8, w17, w9
-; ALL-NEXT:    umov w9, v1.b[10]
+; ALL-NEXT:    udiv w8, w18, w17
 ; ALL-NEXT:    mov v2.b[8], w16
 ; ALL-NEXT:    udiv w9, w10, w9
 ; ALL-NEXT:    umov w10, v1.b[11]
@@ -153,6 +153,7 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw
 ; ALL-NEXT:    umov w11, v0.h[2]
 ; ALL-NEXT:    umov w12, v0.h[3]
 ; ALL-NEXT:    umov w13, v0.h[4]
+; ALL-NEXT:    umov w14, v0.h[5]
 ; ALL-NEXT:    udiv w8, w9, w8
 ; ALL-NEXT:    umov w9, v1.h[0]
 ; ALL-NEXT:    udiv w9, w10, w9
@@ -160,18 +161,17 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw
 ; ALL-NEXT:    udiv w10, w11, w10
 ; ALL-NEXT:    umov w11, v1.h[3]
 ; ALL-NEXT:    fmov s2, w9
-; ALL-NEXT:    umov w9, v1.h[5]
+; ALL-NEXT:    umov w9, v1.h[6]
 ; ALL-NEXT:    mov v2.h[1], w8
 ; ALL-NEXT:    udiv w11, w12, w11
 ; ALL-NEXT:    umov w12, v1.h[4]
 ; ALL-NEXT:    mov v2.h[2], w10
 ; ALL-NEXT:    umov w10, v0.h[6]
 ; ALL-NEXT:    udiv w12, w13, w12
-; ALL-NEXT:    umov w13, v0.h[5]
+; ALL-NEXT:    umov w13, v1.h[5]
 ; ALL-NEXT:    mov v2.h[3], w11
 ; ALL-NEXT:    umov w11, v0.h[7]
-; ALL-NEXT:    udiv w8, w13, w9
-; ALL-NEXT:    umov w9, v1.h[6]
+; ALL-NEXT:    udiv w8, w14, w13
 ; ALL-NEXT:    mov v2.h[4], w12
 ; ALL-NEXT:    udiv w9, w10, w9
 ; ALL-NEXT:    umov w10, v1.h[7]
@@ -226,15 +226,15 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
 ; ALL-NEXT:    mov x10, v1.d[1]
 ; ALL-NEXT:    mov x11, v0.d[1]
 ; ALL-NEXT:    udiv x9, x9, x8
-; ALL-NEXT:    mul x8, x9, x8
 ; ALL-NEXT:    udiv x11, x11, x10
-; ALL-NEXT:    fmov d2, x9
+; ALL-NEXT:    mul x8, x9, x8
 ; ALL-NEXT:    fmov d1, x8
 ; ALL-NEXT:    mul x10, x11, x10
-; ALL-NEXT:    mov v2.d[1], x11
 ; ALL-NEXT:    mov v1.d[1], x10
-; ALL-NEXT:    str q2, [x0]
 ; ALL-NEXT:    sub v0.2d, v0.2d, v1.2d
+; ALL-NEXT:    fmov d1, x9
+; ALL-NEXT:    mov v1.d[1], x11
+; ALL-NEXT:    str q1, [x0]
 ; ALL-NEXT:    ret
   %div = udiv <2 x i64> %x, %y
   store <2 x i64> %div, ptr %divdst, align 16

diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index cb2e7a32f5c485..b10114bc0ffa7e 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -87,9 +87,9 @@ define float @fminimumnum_f32(<8 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: fminimumnum_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    fminv s2, v2.4s
+; CHECK-NEXT:    fminv s1, v2.4s
 ; CHECK-NEXT:    fminv s0, v0.4s
-; CHECK-NEXT:    fminnm s0, s0, s2
+; CHECK-NEXT:    fminnm s0, s0, s1
 ; CHECK-NEXT:    ret
   %r1 = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %a)
   %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
@@ -101,9 +101,9 @@ define float @fmaxnumimum_f32(<8 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: fmaxnumimum_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    fmaxnmv s2, v2.4s
+; CHECK-NEXT:    fmaxnmv s1, v2.4s
 ; CHECK-NEXT:    fmaxnmv s0, v0.4s
-; CHECK-NEXT:    fmax s0, s0, s2
+; CHECK-NEXT:    fmax s0, s0, s1
 ; CHECK-NEXT:    ret
   %r1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
   %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)

diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index 57d24fe86ea4a6..f8397290ab5e14 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -6,18 +6,18 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    fmov s0, wzr
+; CHECK-NEXT:    ldr x11, [sp]
 ; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    ldp x9, x8, [sp]
+; CHECK-NEXT:    ldp x9, x10, [sp, #8]
 ; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fmov w10, s0
-; CHECK-NEXT:    tst w10, #0x1
-; CHECK-NEXT:    ldr x10, [sp, #16]
-; CHECK-NEXT:    csel x8, x5, x8, ne
-; CHECK-NEXT:    csel x9, x4, x9, ne
-; CHECK-NEXT:    csel x11, x3, x7, ne
-; CHECK-NEXT:    csel x12, x2, x6, ne
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel x8, x5, x9, ne
+; CHECK-NEXT:    csel x9, x4, x11, ne
 ; CHECK-NEXT:    stp x9, x8, [x10, #16]
-; CHECK-NEXT:    stp x12, x11, [x10]
+; CHECK-NEXT:    csel x8, x3, x7, ne
+; CHECK-NEXT:    csel x9, x2, x6, ne
+; CHECK-NEXT:    stp x9, x8, [x10]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0
@@ -33,24 +33,24 @@ define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    fmov s0, wzr
-; CHECK-NEXT:    ldp x10, x9, [sp]
-; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    ldr x11, [sp, #16]
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    ldp x9, x10, [sp]
 ; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    dup v1.4s, v0.s[0]
 ; CHECK-NEXT:    mov x8, v1.d[1]
 ; CHECK-NEXT:    lsr x8, x8, #32
 ; CHECK-NEXT:    tst w8, #0x1
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    csel x9, x5, x9, ne
-; CHECK-NEXT:    csel x10, x4, x10, ne
+; CHECK-NEXT:    csel x10, x5, x10, ne
+; CHECK-NEXT:    csel x9, x4, x9, ne
+; CHECK-NEXT:    stur x9, [x11, #12]
 ; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    str w10, [x11, #20]
 ; CHECK-NEXT:    csel x8, x2, x6, ne
-; CHECK-NEXT:    csel x12, x3, x7, ne
-; CHECK-NEXT:    stur x10, [x11, #12]
-; CHECK-NEXT:    str w9, [x11, #20]
+; CHECK-NEXT:    csel x9, x3, x7, ne
 ; CHECK-NEXT:    str x8, [x11]
-; CHECK-NEXT:    str w12, [x11, #8]
+; CHECK-NEXT:    str w9, [x11, #8]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0

diff --git a/llvm/test/CodeGen/AArch64/expand-vector-rot.ll b/llvm/test/CodeGen/AArch64/expand-vector-rot.ll
index de9a0fe9b23aeb..1207b375f2171c 100644
--- a/llvm/test/CodeGen/AArch64/expand-vector-rot.ll
+++ b/llvm/test/CodeGen/AArch64/expand-vector-rot.ll
@@ -7,14 +7,14 @@ define <2 x i16>  @rotlv2_16(<2 x i16> %vec2_16, <2 x i16> %shift) {
 ; CHECK-LABEL: rotlv2_16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.2s, #15
-; CHECK-NEXT:    movi d3, #0x00ffff0000ffff
-; CHECK-NEXT:    neg v4.2s, v1.2s
-; CHECK-NEXT:    and v4.8b, v4.8b, v2.8b
-; CHECK-NEXT:    and v3.8b, v0.8b, v3.8b
-; CHECK-NEXT:    neg v4.2s, v4.2s
+; CHECK-NEXT:    neg v3.2s, v1.2s
+; CHECK-NEXT:    movi d4, #0x00ffff0000ffff
+; CHECK-NEXT:    and v3.8b, v3.8b, v2.8b
+; CHECK-NEXT:    and v4.8b, v0.8b, v4.8b
 ; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    neg v3.2s, v3.2s
 ; CHECK-NEXT:    ushl v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ushl v2.2s, v3.2s, v4.2s
+; CHECK-NEXT:    ushl v2.2s, v4.2s, v3.2s
 ; CHECK-NEXT:    orr v0.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    ret
   %1 = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %vec2_16, <2 x i16> %vec2_16, <2 x i16> %shift)

diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 94cf75391254fe..a93762918cd87c 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -110,21 +110,21 @@ define <4 x i32> @load_v4i12_v4i32(ptr %p) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    ldr w9, [x0, #8]
-; CHECK-NEXT:    ubfx x10, x8, #48, #12
-; CHECK-NEXT:    lsr x11, x8, #60
-; CHECK-NEXT:    orr w11, w11, w9, lsl #4
-; CHECK-NEXT:    and w12, w8, #0xfff
-; CHECK-NEXT:    and w11, w11, #0xfff
-; CHECK-NEXT:    fmov s0, w10
+; CHECK-NEXT:    lsr x10, x8, #60
+; CHECK-NEXT:    ubfx x11, x8, #48, #12
+; CHECK-NEXT:    ubfx w12, w9, #8, #12
+; CHECK-NEXT:    orr w10, w10, w9, lsl #4
+; CHECK-NEXT:    fmov s0, w11
+; CHECK-NEXT:    and w11, w8, #0xfff
+; CHECK-NEXT:    fmov s1, w11
+; CHECK-NEXT:    lsr x9, x9, #20
+; CHECK-NEXT:    and w10, w10, #0xfff
+; CHECK-NEXT:    mov v0.h[1], w10
 ; CHECK-NEXT:    ubfx w10, w8, #12, #12
-; CHECK-NEXT:    fmov s1, w12
-; CHECK-NEXT:    mov v0.h[1], w11
-; CHECK-NEXT:    ubfx w11, w9, #8, #12
 ; CHECK-NEXT:    mov v1.h[1], w10
 ; CHECK-NEXT:    ubfx x10, x8, #24, #12
-; CHECK-NEXT:    lsr x9, x9, #20
 ; CHECK-NEXT:    ubfx x8, x8, #36, #12
-; CHECK-NEXT:    mov v0.h[2], w11
+; CHECK-NEXT:    mov v0.h[2], w12
 ; CHECK-NEXT:    mov v1.h[2], w10
 ; CHECK-NEXT:    mov v0.h[3], w9
 ; CHECK-NEXT:    mov v1.h[3], w8
@@ -264,12 +264,12 @@ define <2 x i16> @std_v2i8_v2i16(ptr %p) {
 ; CHECK-LABEL: std_v2i8_v2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0, #2]
-; CHECK-NEXT:    ldrb w9, [x0]
+; CHECK-NEXT:    ldrb w9, [x0, #3]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ldrb w8, [x0, #3]
-; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    ldrb w9, [x0, #1]
-; CHECK-NEXT:    mov v0.s[1], w8
 ; CHECK-NEXT:    mov v1.s[1], w9
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #3
 ; CHECK-NEXT:    add v0.2s, v1.2s, v0.2s
@@ -370,12 +370,12 @@ define <12 x i32> @load_bv_3xv4i8_i32(ptr %p, ptr %q, ptr %r) {
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
 ; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    uaddw v2.4s, v2.4s, v3.4h
-; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #3
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
+; CHECK-NEXT:    ushll2 v4.4s, v1.8h, #3
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #3
-; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v0.8h
+; CHECK-NEXT:    uaddw v2.4s, v2.4s, v3.4h
+; CHECK-NEXT:    uaddw2 v3.4s, v4.4s, v0.8h
 ; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
 ; CHECK-NEXT:    stp q3, q2, [x8, #16]
 ; CHECK-NEXT:    str q0, [x8]
@@ -450,10 +450,10 @@ define <8 x i32> @double_bv_2xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
 ; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
 ; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    shll v3.4s, v2.4h, #16
 ; CHECK-NEXT:    shll2 v1.4s, v2.8h, #16
-; CHECK-NEXT:    shll v2.4s, v2.4h, #16
 ; CHECK-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
-; CHECK-NEXT:    saddw v0.4s, v2.4s, v0.4h
+; CHECK-NEXT:    saddw v0.4s, v3.4s, v0.4h
 ; CHECK-NEXT:    ret
   %j1 = load <4 x i8>, ptr %p
   %p1 = getelementptr i8, ptr %p, i32 4
@@ -493,7 +493,7 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t,
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ldp s4, s5, [x4]
-; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    usubl v1.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x5], #4
 ; CHECK-NEXT:    ldp s6, s7, [x6]
@@ -502,15 +502,15 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t,
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x7], #4
 ; CHECK-NEXT:    usubl v4.8h, v4.8b, v5.8b
 ; CHECK-NEXT:    ld1 { v7.s }[1], [x7]
-; CHECK-NEXT:    shll v3.4s, v4.4h, #16
-; CHECK-NEXT:    shll2 v1.4s, v4.8h, #16
-; CHECK-NEXT:    usubl v4.8h, v6.8b, v7.8b
-; CHECK-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
-; CHECK-NEXT:    saddw v0.4s, v3.4s, v0.4h
-; CHECK-NEXT:    shll2 v3.4s, v4.8h, #16
-; CHECK-NEXT:    shll v4.4s, v4.4h, #16
+; CHECK-NEXT:    usubl v5.8h, v6.8b, v7.8b
+; CHECK-NEXT:    shll v0.4s, v4.4h, #16
+; CHECK-NEXT:    shll2 v4.4s, v4.8h, #16
+; CHECK-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT:    saddw2 v1.4s, v4.4s, v1.8h
+; CHECK-NEXT:    shll v6.4s, v5.4h, #16
+; CHECK-NEXT:    shll2 v3.4s, v5.8h, #16
 ; CHECK-NEXT:    saddw2 v3.4s, v3.4s, v2.8h
-; CHECK-NEXT:    saddw v2.4s, v4.4s, v2.4h
+; CHECK-NEXT:    saddw v2.4s, v6.4s, v2.4h
 ; CHECK-NEXT:    ret
   %j1 = load <4 x i8>, ptr %p
   %p1 = getelementptr i8, ptr %p, i32 4
@@ -564,26 +564,26 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t,
 define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) {
 ; CHECK-LABEL: double2_bv_4xv4i8_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x4]
-; CHECK-NEXT:    ldr d2, [x0]
-; CHECK-NEXT:    ldr d3, [x1]
-; CHECK-NEXT:    ldr d6, [x5]
-; CHECK-NEXT:    ldr d1, [x2]
-; CHECK-NEXT:    ldr d4, [x3]
-; CHECK-NEXT:    ldr d5, [x7]
-; CHECK-NEXT:    ldr d7, [x6]
-; CHECK-NEXT:    usubl v0.8h, v2.8b, v0.8b
-; CHECK-NEXT:    usubl v2.8h, v3.8b, v6.8b
-; CHECK-NEXT:    usubl v4.8h, v4.8b, v5.8b
-; CHECK-NEXT:    usubl v3.8h, v1.8b, v7.8b
-; CHECK-NEXT:    shll2 v1.4s, v0.8h, #16
+; CHECK-NEXT:    ldr d0, [x2]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    ldr d3, [x3]
+; CHECK-NEXT:    ldr d4, [x4]
+; CHECK-NEXT:    ldr d5, [x5]
+; CHECK-NEXT:    ldr d6, [x6]
+; CHECK-NEXT:    ldr d7, [x7]
+; CHECK-NEXT:    usubl v1.8h, v1.8b, v4.8b
+; CHECK-NEXT:    usubl v2.8h, v2.8b, v5.8b
+; CHECK-NEXT:    usubl v3.8h, v3.8b, v7.8b
+; CHECK-NEXT:    usubl v4.8h, v0.8b, v6.8b
+; CHECK-NEXT:    shll2 v0.4s, v1.8h, #16
 ; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
-; CHECK-NEXT:    saddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT:    shll2 v6.4s, v4.8h, #16
+; CHECK-NEXT:    shll2 v7.4s, v3.8h, #16
+; CHECK-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NEXT:    saddw v1.4s, v5.4s, v2.4h
-; CHECK-NEXT:    shll2 v2.4s, v3.8h, #16
-; CHECK-NEXT:    shll2 v5.4s, v4.8h, #16
-; CHECK-NEXT:    saddw v2.4s, v2.4s, v3.4h
-; CHECK-NEXT:    saddw v3.4s, v5.4s, v4.4h
+; CHECK-NEXT:    saddw v2.4s, v6.4s, v4.4h
+; CHECK-NEXT:    saddw v3.4s, v7.4s, v3.4h
 ; CHECK-NEXT:    ret
   %j1 = load <4 x i8>, ptr %p
   %p1 = getelementptr i8, ptr %p, i32 4
@@ -646,47 +646,47 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_load:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    add x10, x1, #4
-; CHECK-NEXT:    add x11, x1, #8
-; CHECK-NEXT:    add x12, x1, #12
+; CHECK-NEXT:    add x8, x3, #8
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    str s0, [x4]
-; CHECK-NEXT:    ldp s1, s5, [x2]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ldp s1, s5, [x2]
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    umov w8, v1.h[0]
-; CHECK-NEXT:    umov w9, v1.h[1]
-; CHECK-NEXT:    mov v2.b[8], w8
-; CHECK-NEXT:    umov w8, v1.h[2]
-; CHECK-NEXT:    mov v2.b[9], w9
-; CHECK-NEXT:    umov w9, v1.h[3]
+; CHECK-NEXT:    umov w9, v1.h[0]
+; CHECK-NEXT:    umov w10, v1.h[1]
+; CHECK-NEXT:    mov v2.b[8], w9
+; CHECK-NEXT:    umov w9, v1.h[2]
+; CHECK-NEXT:    mov v2.b[9], w10
+; CHECK-NEXT:    umov w10, v1.h[3]
 ; CHECK-NEXT:    ldr s1, [x1]
-; CHECK-NEXT:    mov v2.b[10], w8
-; CHECK-NEXT:    add x8, x3, #8
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    mov v2.b[10], w9
+; CHECK-NEXT:    add x9, x1, #4
 ; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v2.b[11], w9
-; CHECK-NEXT:    add x9, x3, #12
+; CHECK-NEXT:    mov v2.b[11], w10
+; CHECK-NEXT:    add x10, x3, #12
 ; CHECK-NEXT:    ld1 { v2.s }[3], [x3], #4
-; CHECK-NEXT:    ldp s3, s4, [x0, #4]
+; CHECK-NEXT:    ldr s4, [x0, #12]
+; CHECK-NEXT:    ldp s3, s16, [x0, #4]
 ; CHECK-NEXT:    ldp s6, s7, [x2, #8]
-; CHECK-NEXT:    ldr s16, [x0, #12]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x3]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x3]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v16.s }[1], [x12]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x9]
-; CHECK-NEXT:    ushll v1.8h, v6.8b, #0
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v4.8b
-; CHECK-NEXT:    uaddl v3.8h, v3.8b, v16.8b
+; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
+; CHECK-NEXT:    add x8, x1, #8
+; CHECK-NEXT:    ld1 { v16.s }[1], [x8]
+; CHECK-NEXT:    uaddl v1.8h, v3.8b, v4.8b
+; CHECK-NEXT:    ushll v3.8h, v6.8b, #0
 ; CHECK-NEXT:    uaddl v4.8h, v5.8b, v7.8b
-; CHECK-NEXT:    uaddw2 v2.8h, v1.8h, v2.16b
-; CHECK-NEXT:    ushll v5.4s, v3.4h, #3
-; CHECK-NEXT:    ushll2 v1.4s, v3.8h, #3
+; CHECK-NEXT:    uaddl v5.8h, v0.8b, v16.8b
+; CHECK-NEXT:    uaddw2 v2.8h, v3.8h, v2.16b
+; CHECK-NEXT:    ushll v0.4s, v1.4h, #3
+; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #3
 ; CHECK-NEXT:    ushll v6.4s, v4.4h, #3
 ; CHECK-NEXT:    ushll2 v3.4s, v4.8h, #3
-; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
-; CHECK-NEXT:    uaddw v0.4s, v5.4s, v0.4h
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v5.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v5.8h
 ; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v2.8h
 ; CHECK-NEXT:    uaddw v2.4s, v6.4s, v2.4h
 ; CHECK-NEXT:    ret
@@ -757,38 +757,38 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_shuffle:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s0, s1, [x2]
-; CHECK-NEXT:    ldp s2, s3, [x0]
-; CHECK-NEXT:    ldp s6, s7, [x0, #8]
-; CHECK-NEXT:    ldr s18, [x1, #12]
+; CHECK-NEXT:    ldp s2, s3, [x0, #8]
 ; CHECK-NEXT:    add x8, x3, #8
+; CHECK-NEXT:    ldr s16, [x1, #12]
+; CHECK-NEXT:    ldp s0, s1, [x2]
+; CHECK-NEXT:    ldp s6, s7, [x0]
 ; CHECK-NEXT:    add x9, x1, #8
+; CHECK-NEXT:    mov v4.16b, v3.16b
+; CHECK-NEXT:    ldp s17, s18, [x2, #8]
 ; CHECK-NEXT:    ldr s5, [x3, #12]
-; CHECK-NEXT:    ldp s16, s17, [x2, #8]
+; CHECK-NEXT:    mov v3.s[1], v16.s[0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
-; CHECK-NEXT:    ld1 { v2.s }[1], [x1], #4
-; CHECK-NEXT:    mov v4.16b, v7.16b
-; CHECK-NEXT:    ld1 { v6.s }[1], [x9]
-; CHECK-NEXT:    mov v4.s[1], v18.s[0]
-; CHECK-NEXT:    ld1 { v16.s }[1], [x8]
-; CHECK-NEXT:    mov v7.s[1], v18.s[0]
+; CHECK-NEXT:    mov v4.s[1], v16.s[0]
+; CHECK-NEXT:    ld1 { v6.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
+; CHECK-NEXT:    ld1 { v17.s }[1], [x8]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x1]
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    mov v4.s[2], v17.s[0]
-; CHECK-NEXT:    mov v17.s[1], v5.s[0]
-; CHECK-NEXT:    uaddl v3.8h, v3.8b, v7.8b
-; CHECK-NEXT:    uaddl v6.8h, v0.8b, v16.8b
+; CHECK-NEXT:    ld1 { v7.s }[1], [x1]
+; CHECK-NEXT:    mov v4.s[2], v18.s[0]
+; CHECK-NEXT:    mov v18.s[1], v5.s[0]
+; CHECK-NEXT:    uaddl v2.8h, v6.8b, v2.8b
+; CHECK-NEXT:    uaddl v6.8h, v0.8b, v17.8b
+; CHECK-NEXT:    uaddl v3.8h, v7.8b, v3.8b
+; CHECK-NEXT:    uaddl v1.8h, v1.8b, v18.8b
 ; CHECK-NEXT:    mov v4.s[3], v5.s[0]
-; CHECK-NEXT:    uaddl v7.8h, v1.8b, v17.8b
-; CHECK-NEXT:    ushll2 v0.4s, v3.8h, #3
-; CHECK-NEXT:    ushll v3.4s, v3.4h, #3
-; CHECK-NEXT:    uaddw2 v1.4s, v0.4s, v2.8h
+; CHECK-NEXT:    ushll v0.4s, v3.4h, #3
+; CHECK-NEXT:    ushll v7.4s, v1.4h, #3
+; CHECK-NEXT:    ushll2 v16.4s, v1.8h, #3
+; CHECK-NEXT:    ushll2 v1.4s, v3.8h, #3
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
 ; CHECK-NEXT:    str q4, [x4]
-; CHECK-NEXT:    uaddw v0.4s, v3.4s, v2.4h
-; CHECK-NEXT:    ushll2 v2.4s, v7.8h, #3
-; CHECK-NEXT:    ushll v7.4s, v7.4h, #3
-; CHECK-NEXT:    uaddw2 v3.4s, v2.4s, v6.8h
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
+; CHECK-NEXT:    uaddw2 v3.4s, v16.4s, v6.8h
 ; CHECK-NEXT:    uaddw v2.4s, v7.4s, v6.4h
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
@@ -859,35 +859,35 @@ define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_ext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x2]
-; CHECK-NEXT:    ldp s2, s3, [x0]
 ; CHECK-NEXT:    add x8, x3, #8
+; CHECK-NEXT:    ldp s2, s3, [x0]
 ; CHECK-NEXT:    add x9, x1, #8
-; CHECK-NEXT:    ldp s5, s6, [x2, #8]
-; CHECK-NEXT:    add x10, x1, #12
-; CHECK-NEXT:    add x11, x3, #12
-; CHECK-NEXT:    ldp s7, s4, [x0, #8]
+; CHECK-NEXT:    add x10, x3, #12
+; CHECK-NEXT:    ldp s4, s5, [x2, #8]
+; CHECK-NEXT:    ldp s6, s7, [x0, #8]
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v6.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v6.s }[1], [x9]
+; CHECK-NEXT:    ld1 { v4.s }[1], [x8]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x8]
-; CHECK-NEXT:    uaddl v1.8h, v1.8b, v6.8b
-; CHECK-NEXT:    uaddl v3.8h, v3.8b, v4.8b
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v7.8b
-; CHECK-NEXT:    uaddl v5.8h, v0.8b, v5.8b
-; CHECK-NEXT:    ushll v7.4s, v1.4h, #3
-; CHECK-NEXT:    ushll v0.4s, v3.4h, #3
-; CHECK-NEXT:    ushll2 v3.4s, v3.8h, #3
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
+; CHECK-NEXT:    uaddl v4.8h, v0.8b, v4.8b
+; CHECK-NEXT:    uaddl v1.8h, v1.8b, v5.8b
+; CHECK-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-NEXT:    uaddl v3.8h, v3.8b, v7.8b
+; CHECK-NEXT:    ushll v6.4s, v1.4h, #3
 ; CHECK-NEXT:    ushll2 v16.4s, v1.8h, #3
-; CHECK-NEXT:    uaddw2 v1.4s, v3.4s, v2.8h
-; CHECK-NEXT:    uaddw2 v3.4s, v16.4s, v5.8h
+; CHECK-NEXT:    ushll v0.4s, v3.4h, #3
+; CHECK-NEXT:    ushll2 v1.4s, v3.8h, #3
+; CHECK-NEXT:    uaddw2 v3.4s, v16.4s, v4.8h
 ; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
-; CHECK-NEXT:    uaddw v2.4s, v7.4s, v5.4h
-; CHECK-NEXT:    ushll v5.8h, v6.8b, #0
-; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
+; CHECK-NEXT:    uaddw v2.4s, v6.4s, v4.4h
+; CHECK-NEXT:    ushll v4.8h, v7.8b, #0
 ; CHECK-NEXT:    stp q4, q5, [x4]
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
@@ -958,34 +958,34 @@ define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_add:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
-; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    add x8, x3, #8
+; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    add x9, x1, #8
+; CHECK-NEXT:    add x10, x3, #12
 ; CHECK-NEXT:    ldp s4, s5, [x0, #8]
-; CHECK-NEXT:    add x10, x1, #12
-; CHECK-NEXT:    add x11, x3, #12
 ; CHECK-NEXT:    ldp s6, s7, [x2, #8]
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v5.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v4.8b
-; CHECK-NEXT:    uaddl v4.8h, v3.8b, v7.8b
-; CHECK-NEXT:    uaddl v5.8h, v1.8b, v5.8b
 ; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    ushll v6.4s, v4.4h, #3
-; CHECK-NEXT:    ushll v7.4s, v5.4h, #3
-; CHECK-NEXT:    stp q5, q4, [x4]
-; CHECK-NEXT:    ushll2 v1.4s, v5.8h, #3
-; CHECK-NEXT:    ushll2 v3.4s, v4.8h, #3
-; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
+; CHECK-NEXT:    uaddl v7.8h, v3.8b, v7.8b
+; CHECK-NEXT:    uaddl v5.8h, v1.8b, v5.8b
+; CHECK-NEXT:    uaddl v1.8h, v0.8b, v4.8b
+; CHECK-NEXT:    ushll v4.4s, v7.4h, #3
+; CHECK-NEXT:    ushll2 v3.4s, v7.8h, #3
+; CHECK-NEXT:    ushll v0.4s, v5.4h, #3
+; CHECK-NEXT:    ushll2 v6.4s, v5.8h, #3
+; CHECK-NEXT:    stp q5, q7, [x4]
 ; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v2.8h
-; CHECK-NEXT:    uaddw v0.4s, v7.4s, v0.4h
-; CHECK-NEXT:    uaddw v2.4s, v6.4s, v2.4h
+; CHECK-NEXT:    uaddw v2.4s, v4.4s, v2.4h
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v6.4s, v1.8h
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
   %p2 = getelementptr i8, ptr %p, i32 4
@@ -1055,38 +1055,38 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_ext2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x2]
-; CHECK-NEXT:    ldp s2, s3, [x0]
 ; CHECK-NEXT:    add x8, x3, #8
+; CHECK-NEXT:    ldp s2, s3, [x0]
 ; CHECK-NEXT:    add x9, x1, #8
+; CHECK-NEXT:    add x10, x3, #12
 ; CHECK-NEXT:    ldp s4, s5, [x2, #8]
-; CHECK-NEXT:    add x10, x1, #12
-; CHECK-NEXT:    add x11, x3, #12
 ; CHECK-NEXT:    ldp s6, s7, [x0, #8]
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v5.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x8]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x1]
 ; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    uaddl v6.8h, v3.8b, v7.8b
-; CHECK-NEXT:    uaddl v5.8h, v1.8b, v5.8b
 ; CHECK-NEXT:    uaddl v4.8h, v0.8b, v4.8b
-; CHECK-NEXT:    ushll2 v0.4s, v6.8h, #3
-; CHECK-NEXT:    ushll v3.4s, v6.4h, #3
-; CHECK-NEXT:    ushll v7.4s, v5.4h, #0
-; CHECK-NEXT:    ushll2 v16.4s, v5.8h, #0
-; CHECK-NEXT:    uaddw2 v1.4s, v0.4s, v2.8h
-; CHECK-NEXT:    uaddw v0.4s, v3.4s, v2.4h
-; CHECK-NEXT:    stp q7, q16, [x4, #32]
-; CHECK-NEXT:    ushll2 v2.4s, v5.8h, #3
-; CHECK-NEXT:    ushll v5.4s, v5.4h, #3
-; CHECK-NEXT:    uaddw2 v3.4s, v2.4s, v4.8h
+; CHECK-NEXT:    uaddl v7.8h, v3.8b, v7.8b
+; CHECK-NEXT:    uaddl v3.8h, v1.8b, v5.8b
+; CHECK-NEXT:    ushll v0.4s, v7.4h, #3
+; CHECK-NEXT:    ushll2 v1.4s, v7.8h, #3
+; CHECK-NEXT:    ushll v5.4s, v3.4h, #3
+; CHECK-NEXT:    ushll2 v6.4s, v3.8h, #3
+; CHECK-NEXT:    ushll2 v16.4s, v3.8h, #0
+; CHECK-NEXT:    ushll v17.4s, v3.4h, #0
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
 ; CHECK-NEXT:    uaddw v2.4s, v5.4s, v4.4h
-; CHECK-NEXT:    ushll2 v4.4s, v6.8h, #0
-; CHECK-NEXT:    ushll v5.4s, v6.4h, #0
+; CHECK-NEXT:    uaddw2 v3.4s, v6.4s, v4.8h
+; CHECK-NEXT:    ushll2 v4.4s, v7.8h, #0
+; CHECK-NEXT:    ushll v5.4s, v7.4h, #0
+; CHECK-NEXT:    stp q17, q16, [x4, #32]
 ; CHECK-NEXT:    stp q5, q4, [x4]
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
@@ -1157,35 +1157,35 @@ define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_shl:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
-; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    add x8, x3, #8
+; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    add x9, x1, #8
+; CHECK-NEXT:    add x10, x3, #12
 ; CHECK-NEXT:    ldp s4, s5, [x0, #8]
-; CHECK-NEXT:    add x10, x1, #12
-; CHECK-NEXT:    add x11, x3, #12
 ; CHECK-NEXT:    ldp s6, s7, [x2, #8]
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v5.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
+; CHECK-NEXT:    uaddl v4.8h, v0.8b, v4.8b
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
 ; CHECK-NEXT:    uaddl v3.8h, v3.8b, v7.8b
 ; CHECK-NEXT:    uaddl v1.8h, v1.8b, v5.8b
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v4.8b
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    ushll v4.4s, v1.4h, #3
-; CHECK-NEXT:    ushll v5.4s, v3.4h, #3
-; CHECK-NEXT:    ushll2 v6.4s, v1.8h, #3
-; CHECK-NEXT:    ushll2 v7.4s, v3.8h, #3
-; CHECK-NEXT:    uaddw2 v1.4s, v6.4s, v0.8h
-; CHECK-NEXT:    stp q4, q6, [x4]
-; CHECK-NEXT:    uaddw2 v3.4s, v7.4s, v2.8h
-; CHECK-NEXT:    stp q5, q7, [x4, #32]
-; CHECK-NEXT:    uaddw v0.4s, v4.4s, v0.4h
-; CHECK-NEXT:    uaddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT:    ushll v6.4s, v3.4h, #3
+; CHECK-NEXT:    ushll2 v16.4s, v3.8h, #3
+; CHECK-NEXT:    ushll v5.4s, v1.4h, #3
+; CHECK-NEXT:    ushll2 v7.4s, v1.8h, #3
+; CHECK-NEXT:    uaddw2 v3.4s, v16.4s, v2.8h
+; CHECK-NEXT:    uaddw v2.4s, v6.4s, v2.4h
+; CHECK-NEXT:    stp q6, q16, [x4, #32]
+; CHECK-NEXT:    uaddw v0.4s, v5.4s, v4.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v7.4s, v4.8h
+; CHECK-NEXT:    stp q5, q7, [x4]
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
   %p2 = getelementptr i8, ptr %p, i32 4
@@ -1283,16 +1283,16 @@ define <8 x i32> @commuted_loads(ptr %p1, ptr %p2) {
 define <8 x i32> @commuted_loads2(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: commuted_loads2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp d0, d1, [x0]
-; CHECK-NEXT:    ldp d2, d3, [x1]
-; CHECK-NEXT:    add v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    add v1.8b, v1.8b, v3.8b
+; CHECK-NEXT:    ldp d0, d3, [x1]
+; CHECK-NEXT:    ldp d1, d2, [x0]
+; CHECK-NEXT:    add v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    add v1.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll v2.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #3
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #3
-; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT:    ushll v3.4s, v0.4h, #3
+; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #3
+; CHECK-NEXT:    uaddw2 v1.4s, v0.4s, v2.8h
+; CHECK-NEXT:    uaddw v0.4s, v3.4s, v2.4h
 ; CHECK-NEXT:    ret
   %l11 = load <8 x i8>, ptr %p1
   %q1 = getelementptr i8, ptr %p1, i32 8
@@ -1312,16 +1312,17 @@ define <8 x i32> @commuted_loads2(ptr %p1, ptr %p2) {
 define <8 x i32> @commuted_sub(ptr %p1, ptr %p2) {
 ; CHECK-LABEL: commuted_sub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp d0, d1, [x0]
-; CHECK-NEXT:    ldp d3, d2, [x1]
-; CHECK-NEXT:    add v0.8b, v0.8b, v3.8b
+; CHECK-NEXT:    ldp d2, d1, [x1]
+; CHECK-NEXT:    ldr d0, [x0, #8]
+; CHECK-NEXT:    add v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ldr d1, [x0]
 ; CHECK-NEXT:    add v1.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #3
-; CHECK-NEXT:    ushll v3.4s, v1.4h, #3
-; CHECK-NEXT:    usubw2 v1.4s, v2.4s, v0.8h
-; CHECK-NEXT:    usubw v0.4s, v3.4s, v0.4h
+; CHECK-NEXT:    ushll v2.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v3.4s, v0.4h, #3
+; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #3
+; CHECK-NEXT:    usubw2 v1.4s, v0.4s, v2.8h
+; CHECK-NEXT:    usubw v0.4s, v3.4s, v2.4h
 ; CHECK-NEXT:    ret
   %l11 = load <8 x i8>, ptr %p1
   %q1 = getelementptr i8, ptr %p1, i32 8
@@ -1362,15 +1363,15 @@ define <4 x i32> @atomic(ptr %p) {
 ; CHECK-LABEL: atomic:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldar w8, [x0]
-; CHECK-NEXT:    ldr s0, [x0, #4]
-; CHECK-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    zip1 v1.8b, v1.8b, v0.8b
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #3
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    movi v0.2d, #0x0000ff000000ff
+; CHECK-NEXT:    ldr s1, [x0, #4]
+; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    zip1 v2.8b, v2.8b, v0.8b
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #3
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %l1b = load atomic float, ptr %p acquire, align 4
   %l1 = bitcast float %l1b to <4 x i8>
@@ -1391,8 +1392,8 @@ define <4 x i32> @volatile(ptr %p) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ldr s1, [x0, #4]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #3
 ; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
 ; CHECK-NEXT:    add sp, sp, #16

diff --git a/llvm/test/CodeGen/AArch64/extend_inreg_of_concat_subvectors.ll b/llvm/test/CodeGen/AArch64/extend_inreg_of_concat_subvectors.ll
index f0db7cc23c4e5a..1f1bfe6906482e 100644
--- a/llvm/test/CodeGen/AArch64/extend_inreg_of_concat_subvectors.ll
+++ b/llvm/test/CodeGen/AArch64/extend_inreg_of_concat_subvectors.ll
@@ -18,14 +18,14 @@ define void @zext_of_concat(ptr %a, ptr %b, ptr %c, ptr %d) nounwind {
 ;
 ; CHECK-BE-LABEL: zext_of_concat:
 ; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
 ; CHECK-BE-NEXT:    ld1 { v1.2s }, [x1]
-; CHECK-BE-NEXT:    ld1 { v2.2s }, [x0]
-; CHECK-BE-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-BE-NEXT:    add v1.2s, v2.2s, v1.2s
-; CHECK-BE-NEXT:    ld1 { v2.4s }, [x2]
-; CHECK-BE-NEXT:    zip1 v1.4s, v1.4s, v1.4s
-; CHECK-BE-NEXT:    trn2 v0.4s, v1.4s, v0.4s
-; CHECK-BE-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-BE-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-BE-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-BE-NEXT:    zip1 v0.4s, v0.4s, v0.4s
+; CHECK-BE-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-BE-NEXT:    ld1 { v1.4s }, [x2]
+; CHECK-BE-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-BE-NEXT:    st1 { v0.4s }, [x2]
 ; CHECK-BE-NEXT:    ret
   %i0.a = load <2 x i32>, ptr %a
@@ -42,29 +42,29 @@ define void @zext_of_concat(ptr %a, ptr %b, ptr %c, ptr %d) nounwind {
 define void @zext_of_concat_extrause(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) nounwind {
 ; CHECK-LABEL: zext_of_concat_extrause:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr d0, [x1]
 ; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d2, [x1]
-; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    add.2s v1, v1, v2
-; CHECK-NEXT:    mov.d v1[1], v1[0]
-; CHECK-NEXT:    str q1, [x4]
-; CHECK-NEXT:    zip1.4s v0, v1, v0
-; CHECK-NEXT:    ldr q1, [x2]
-; CHECK-NEXT:    add.4s v0, v0, v1
+; CHECK-NEXT:    add.2s v0, v1, v0
+; CHECK-NEXT:    movi.2d v1, #0000000000000000
+; CHECK-NEXT:    mov.d v0[1], v0[0]
+; CHECK-NEXT:    zip1.4s v1, v0, v1
+; CHECK-NEXT:    str q0, [x4]
+; CHECK-NEXT:    ldr q0, [x2]
+; CHECK-NEXT:    add.4s v0, v1, v0
 ; CHECK-NEXT:    str q0, [x2]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: zext_of_concat_extrause:
 ; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x1]
 ; CHECK-BE-NEXT:    ld1 { v1.2s }, [x0]
-; CHECK-BE-NEXT:    ld1 { v2.2s }, [x1]
-; CHECK-BE-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-BE-NEXT:    add v1.2s, v1.2s, v2.2s
-; CHECK-BE-NEXT:    mov v1.d[1], v1.d[0]
-; CHECK-BE-NEXT:    zip1 v2.4s, v1.4s, v1.4s
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x4]
+; CHECK-BE-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-BE-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-BE-NEXT:    mov v0.d[1], v0.d[0]
+; CHECK-BE-NEXT:    zip1 v1.4s, v0.4s, v0.4s
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x4]
+; CHECK-BE-NEXT:    trn2 v0.4s, v1.4s, v2.4s
 ; CHECK-BE-NEXT:    ld1 { v1.4s }, [x2]
-; CHECK-BE-NEXT:    trn2 v0.4s, v2.4s, v0.4s
 ; CHECK-BE-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-BE-NEXT:    st1 { v0.4s }, [x2]
 ; CHECK-BE-NEXT:    ret
@@ -116,9 +116,9 @@ define void @aext_of_concat(ptr %a, ptr %b, ptr %c, ptr %d) nounwind {
 define void @aext_of_concat_extrause(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) nounwind {
 ; CHECK-LABEL: aext_of_concat_extrause:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    add.2s v0, v0, v1
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    add.2s v0, v1, v0
 ; CHECK-NEXT:    mov.16b v1, v0
 ; CHECK-NEXT:    mov.d v1[1], v0[0]
 ; CHECK-NEXT:    zip1.4s v0, v0, v0
@@ -130,9 +130,9 @@ define void @aext_of_concat_extrause(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) nou
 ;
 ; CHECK-BE-LABEL: aext_of_concat_extrause:
 ; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    ld1 { v0.2s }, [x0]
-; CHECK-BE-NEXT:    ld1 { v1.2s }, [x1]
-; CHECK-BE-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-BE-NEXT:    ld1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    ld1 { v1.2s }, [x0]
+; CHECK-BE-NEXT:    add v0.2s, v1.2s, v0.2s
 ; CHECK-BE-NEXT:    mov v1.16b, v0.16b
 ; CHECK-BE-NEXT:    mov v1.d[1], v0.d[0]
 ; CHECK-BE-NEXT:    zip1 v0.4s, v0.4s, v0.4s

diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll
index 2913d73886998c..d4ea143a3d8473 100644
--- a/llvm/test/CodeGen/AArch64/extract-bits.ll
+++ b/llvm/test/CodeGen/AArch64/extract-bits.ll
@@ -21,7 +21,7 @@
 define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -37,7 +37,7 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a0_arithmetic:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    asr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -53,7 +53,7 @@ define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) n
 define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -71,12 +71,12 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #1
-; CHECK-NEXT:    lsl w8, w8, w2
-; CHECK-NEXT:    sub w8, w8, #1
-; CHECK-NEXT:    lsr w9, w9, w1
-; CHECK-NEXT:    and w0, w8, w9
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    lsl w9, w9, w2
+; CHECK-NEXT:    lsr w8, w8, w1
+; CHECK-NEXT:    sub w9, w9, #1
+; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %shifted = lshr i32 %val, %numskipbits
@@ -89,12 +89,12 @@ define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #1
-; CHECK-NEXT:    lsl w8, w8, w2
-; CHECK-NEXT:    sub w8, w8, #1
-; CHECK-NEXT:    lsr w9, w9, w1
-; CHECK-NEXT:    and w0, w8, w9
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    lsl w9, w9, w2
+; CHECK-NEXT:    lsr w8, w8, w1
+; CHECK-NEXT:    sub w9, w9, #1
+; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %skip = zext i8 %numskipbits to i32
@@ -109,7 +109,7 @@ define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -127,7 +127,7 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    sub x8, x8, #1
@@ -143,7 +143,7 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a0_arithmetic:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    asr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    sub x8, x8, #1
@@ -159,7 +159,7 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n
 define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr x9, x0, x1
@@ -179,12 +179,12 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w8, #1
-; CHECK-NEXT:    lsl x8, x8, x2
-; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    lsr x9, x9, x1
-; CHECK-NEXT:    and x0, x8, x9
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    lsl x9, x9, x2
+; CHECK-NEXT:    lsr x8, x8, x1
+; CHECK-NEXT:    sub x9, x9, #1
+; CHECK-NEXT:    and x0, x9, x8
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %shifted = lshr i64 %val, %numskipbits
@@ -197,14 +197,14 @@ define i64 @bextr64_a2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    lsr x9, x9, x1
-; CHECK-NEXT:    and x0, x8, x9
+; CHECK-NEXT:    lsl x9, x9, x2
+; CHECK-NEXT:    lsr x8, x8, x1
+; CHECK-NEXT:    sub x9, x9, #1
+; CHECK-NEXT:    and x0, x9, x8
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %skip = zext i8 %numskipbits to i64
@@ -219,7 +219,7 @@ define i64 @bextr64_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    sub x8, x8, #1
@@ -238,7 +238,7 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_a0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -256,7 +256,7 @@ define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_a1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -275,7 +275,7 @@ define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_a2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    sub w8, w8, #1
@@ -297,7 +297,7 @@ define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_b0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    bic w0, w9, w8
@@ -312,7 +312,7 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_b1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    bic w0, w9, w8
@@ -329,11 +329,11 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_b2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
-; CHECK-NEXT:    lsl w8, w8, w2
-; CHECK-NEXT:    lsr w9, w9, w1
-; CHECK-NEXT:    bic w0, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
+; CHECK-NEXT:    lsl w9, w9, w2
+; CHECK-NEXT:    lsr w8, w8, w1
+; CHECK-NEXT:    bic w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %shifted = lshr i32 %val, %numskipbits
@@ -346,11 +346,11 @@ define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_b3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
-; CHECK-NEXT:    lsl w8, w8, w2
-; CHECK-NEXT:    lsr w9, w9, w1
-; CHECK-NEXT:    bic w0, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
+; CHECK-NEXT:    lsl w9, w9, w2
+; CHECK-NEXT:    lsr w8, w8, w1
+; CHECK-NEXT:    bic w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %skip = zext i8 %numskipbits to i32
@@ -365,7 +365,7 @@ define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_b4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w9, w0, w1
 ; CHECK-NEXT:    lsl w8, w8, w2
 ; CHECK-NEXT:    bic w0, w9, w8
@@ -382,7 +382,7 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_b0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    bic x0, x9, x8
@@ -397,7 +397,7 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_b1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr x9, x0, x1
@@ -416,11 +416,11 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_b2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    lsl x8, x8, x2
-; CHECK-NEXT:    lsr x9, x9, x1
-; CHECK-NEXT:    bic x0, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    lsl x9, x9, x2
+; CHECK-NEXT:    lsr x8, x8, x1
+; CHECK-NEXT:    bic x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %shifted = lshr i64 %val, %numskipbits
@@ -433,13 +433,13 @@ define i64 @bextr64_b2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_b3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    lsl x8, x8, x2
-; CHECK-NEXT:    lsr x9, x9, x1
-; CHECK-NEXT:    bic x0, x9, x8
+; CHECK-NEXT:    lsl x9, x9, x2
+; CHECK-NEXT:    lsr x8, x8, x1
+; CHECK-NEXT:    bic x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %skip = zext i8 %numskipbits to i64
@@ -454,7 +454,7 @@ define i64 @bextr64_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_b4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
 ; CHECK-NEXT:    bic x0, x9, x8
@@ -472,7 +472,7 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_b0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl x8, x8, x2
@@ -491,7 +491,7 @@ define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_b1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl w8, w8, w2
@@ -511,7 +511,7 @@ define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_b2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    lsr x9, x0, x1
 ; CHECK-NEXT:    lsl w8, w8, w2
@@ -535,7 +535,7 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_c0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w10, w0, w1
 ; CHECK-NEXT:    lsr w8, w9, w8
 ; CHECK-NEXT:    and w0, w8, w10
@@ -550,11 +550,11 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_c1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
-; CHECK-NEXT:    mov w9, #-1
-; CHECK-NEXT:    sub w8, w8, w2
+; CHECK-NEXT:    mov w8, #32 // =0x20
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr w10, w0, w1
+; CHECK-NEXT:    sub w8, w8, w2
 ; CHECK-NEXT:    lsr w8, w9, w8
 ; CHECK-NEXT:    and w0, w8, w10
 ; CHECK-NEXT:    ret
@@ -570,12 +570,12 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_c2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w10, #-1
-; CHECK-NEXT:    lsr w9, w9, w1
-; CHECK-NEXT:    lsr w8, w10, w8
-; CHECK-NEXT:    and w0, w8, w9
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w2
+; CHECK-NEXT:    mov w10, #-1 // =0xffffffff
+; CHECK-NEXT:    lsr w9, w10, w9
+; CHECK-NEXT:    lsr w8, w8, w1
+; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %shifted = lshr i32 %val, %numskipbits
@@ -588,14 +588,14 @@ define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_c3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    sub w8, w8, w2
-; CHECK-NEXT:    mov w10, #-1
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEXT:    mov w10, #-1 // =0xffffffff
+; CHECK-NEXT:    sub w9, w9, w2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    lsr w9, w9, w1
-; CHECK-NEXT:    lsr w8, w10, w8
-; CHECK-NEXT:    and w0, w8, w9
+; CHECK-NEXT:    lsr w8, w8, w1
+; CHECK-NEXT:    lsr w9, w10, w9
+; CHECK-NEXT:    and w0, w9, w8
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %skip = zext i8 %numskipbits to i32
@@ -611,7 +611,7 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; CHECK-LABEL: bextr32_c4_commutative:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr w10, w0, w1
 ; CHECK-NEXT:    lsr w8, w9, w8
 ; CHECK-NEXT:    and w0, w10, w8
@@ -629,7 +629,7 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_c0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg x8, x2
-; CHECK-NEXT:    mov x9, #-1
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr x8, x9, x8
 ; CHECK-NEXT:    and x0, x8, x10
@@ -644,11 +644,11 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_c1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
-; CHECK-NEXT:    mov x9, #-1
-; CHECK-NEXT:    sub w8, w8, w2
+; CHECK-NEXT:    mov w8, #64 // =0x40
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr x10, x0, x1
+; CHECK-NEXT:    sub w8, w8, w2
 ; CHECK-NEXT:    lsr x8, x9, x8
 ; CHECK-NEXT:    and x0, x8, x10
 ; CHECK-NEXT:    ret
@@ -664,12 +664,12 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_c2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x2
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov x10, #-1
-; CHECK-NEXT:    lsr x9, x9, x1
-; CHECK-NEXT:    lsr x8, x10, x8
-; CHECK-NEXT:    and x0, x8, x9
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x2
+; CHECK-NEXT:    mov x10, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    lsr x9, x10, x9
+; CHECK-NEXT:    lsr x8, x8, x1
+; CHECK-NEXT:    and x0, x9, x8
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %shifted = lshr i64 %val, %numskipbits
@@ -682,14 +682,14 @@ define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_c3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    sub w8, w8, w2
-; CHECK-NEXT:    mov x10, #-1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mov w9, #64 // =0x40
+; CHECK-NEXT:    mov x10, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    sub w9, w9, w2
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    lsr x9, x9, x1
-; CHECK-NEXT:    lsr x8, x10, x8
-; CHECK-NEXT:    and x0, x8, x9
+; CHECK-NEXT:    lsr x8, x8, x1
+; CHECK-NEXT:    lsr x9, x10, x9
+; CHECK-NEXT:    and x0, x9, x8
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %skip = zext i8 %numskipbits to i64
@@ -705,7 +705,7 @@ define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; CHECK-LABEL: bextr64_c4_commutative:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg x8, x2
-; CHECK-NEXT:    mov x9, #-1
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr x8, x9, x8
 ; CHECK-NEXT:    and x0, x10, x8
@@ -724,7 +724,7 @@ define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 ; CHECK-LABEL: bextr64_32_c0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg x8, x2
-; CHECK-NEXT:    mov x9, #-1
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr x8, x9, x8
 ; CHECK-NEXT:    and w0, w8, w10
@@ -742,7 +742,7 @@ define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 ; CHECK-LABEL: bextr64_32_c1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr w8, w9, w8
 ; CHECK-NEXT:    and w0, w8, w10
@@ -761,7 +761,7 @@ define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 ; CHECK-LABEL: bextr64_32_c2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    lsr x10, x0, x1
 ; CHECK-NEXT:    lsr w8, w9, w8
 ; CHECK-NEXT:    and w0, w8, w10
@@ -782,10 +782,10 @@ define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_d0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    lsr w9, w0, w1
-; CHECK-NEXT:    lsl w9, w9, w8
-; CHECK-NEXT:    lsr w0, w9, w8
+; CHECK-NEXT:    lsr w8, w0, w1
+; CHECK-NEXT:    neg w9, w2
+; CHECK-NEXT:    lsl w8, w8, w9
+; CHECK-NEXT:    lsr w0, w8, w9
 ; CHECK-NEXT:    ret
   %shifted = lshr i32 %val, %numskipbits
   %numhighbits = sub i32 32, %numlowbits
@@ -797,12 +797,12 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_d1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    lsr w9, w0, w1
-; CHECK-NEXT:    sub w8, w8, w2
-; CHECK-NEXT:    lsl w9, w9, w8
-; CHECK-NEXT:    lsr w0, w9, w8
+; CHECK-NEXT:    lsr w8, w0, w1
+; CHECK-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEXT:    sub w9, w9, w2
+; CHECK-NEXT:    lsl w8, w8, w9
+; CHECK-NEXT:    lsr w0, w8, w9
 ; CHECK-NEXT:    ret
   %skip = zext i8 %numskipbits to i32
   %shifted = lshr i32 %val, %skip
@@ -833,13 +833,13 @@ define i32 @bextr32_d2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_d3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    sub w8, w8, w2
+; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    lsr w9, w9, w1
-; CHECK-NEXT:    lsl w9, w9, w8
-; CHECK-NEXT:    lsr w0, w9, w8
+; CHECK-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEXT:    sub w9, w9, w2
+; CHECK-NEXT:    lsr w8, w8, w1
+; CHECK-NEXT:    lsl w8, w8, w9
+; CHECK-NEXT:    lsr w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %skip = zext i8 %numskipbits to i32
@@ -856,10 +856,10 @@ define i32 @bextr32_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_d0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x2
-; CHECK-NEXT:    lsr x9, x0, x1
-; CHECK-NEXT:    lsl x9, x9, x8
-; CHECK-NEXT:    lsr x0, x9, x8
+; CHECK-NEXT:    lsr x8, x0, x1
+; CHECK-NEXT:    neg x9, x2
+; CHECK-NEXT:    lsl x8, x8, x9
+; CHECK-NEXT:    lsr x0, x8, x9
 ; CHECK-NEXT:    ret
   %shifted = lshr i64 %val, %numskipbits
   %numhighbits = sub i64 64, %numlowbits
@@ -871,12 +871,12 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_d1_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    lsr x9, x0, x1
-; CHECK-NEXT:    sub w8, w8, w2
-; CHECK-NEXT:    lsl x9, x9, x8
-; CHECK-NEXT:    lsr x0, x9, x8
+; CHECK-NEXT:    lsr x8, x0, x1
+; CHECK-NEXT:    mov w9, #64 // =0x40
+; CHECK-NEXT:    sub w9, w9, w2
+; CHECK-NEXT:    lsl x8, x8, x9
+; CHECK-NEXT:    lsr x0, x8, x9
 ; CHECK-NEXT:    ret
   %skip = zext i8 %numskipbits to i64
   %shifted = lshr i64 %val, %skip
@@ -907,13 +907,13 @@ define i64 @bextr64_d2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_d3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    sub w8, w8, w2
+; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    lsr x9, x9, x1
-; CHECK-NEXT:    lsl x9, x9, x8
-; CHECK-NEXT:    lsr x0, x9, x8
+; CHECK-NEXT:    mov w9, #64 // =0x40
+; CHECK-NEXT:    sub w9, w9, w2
+; CHECK-NEXT:    lsr x8, x8, x1
+; CHECK-NEXT:    lsl x8, x8, x9
+; CHECK-NEXT:    lsr x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %skip = zext i8 %numskipbits to i64
@@ -931,10 +931,10 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
 define i32 @bextr64_32_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_d0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x2
-; CHECK-NEXT:    lsr x9, x0, x1
-; CHECK-NEXT:    lsl x9, x9, x8
-; CHECK-NEXT:    lsr x0, x9, x8
+; CHECK-NEXT:    lsr x8, x0, x1
+; CHECK-NEXT:    neg x9, x2
+; CHECK-NEXT:    lsl x8, x8, x9
+; CHECK-NEXT:    lsr x0, x8, x9
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
   %shifted = lshr i64 %val, %numskipbits
@@ -949,10 +949,10 @@ define i32 @bextr64_32_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind
 define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_32_d1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w2
-; CHECK-NEXT:    lsr x9, x0, x1
-; CHECK-NEXT:    lsl w9, w9, w8
-; CHECK-NEXT:    lsr w0, w9, w8
+; CHECK-NEXT:    lsr x8, x0, x1
+; CHECK-NEXT:    neg w9, w2
+; CHECK-NEXT:    lsl w8, w8, w9
+; CHECK-NEXT:    lsr w0, w8, w9
 ; CHECK-NEXT:    ret
   %shifted = lshr i64 %val, %numskipbits
   %truncshifted = trunc i64 %shifted to i32

diff --git a/llvm/test/CodeGen/AArch64/extract-lowbits.ll b/llvm/test/CodeGen/AArch64/extract-lowbits.ll
index 741a1fb05eda2c..4b8f3e86b5fefe 100644
--- a/llvm/test/CodeGen/AArch64/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AArch64/extract-lowbits.ll
@@ -162,8 +162,8 @@ define i64 @bzhi64_a3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi64_a3_load_indexzext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    ldr x9, [x0]
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    ldr x9, [x0]
 ; CHECK-NEXT:    lsl x8, x8, x1
 ; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    and x0, x8, x9
@@ -224,10 +224,10 @@ define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 define i32 @bzhi32_b2_load(ptr %w, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi32_b2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-NEXT:    lsl w9, w9, w1
-; CHECK-NEXT:    bic w0, w8, w9
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    ldr w9, [x0]
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    bic w0, w9, w8
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %notmask = shl i32 -1, %numlowbits
@@ -239,10 +239,10 @@ define i32 @bzhi32_b2_load(ptr %w, i32 %numlowbits) nounwind {
 define i32 @bzhi32_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi32_b3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-NEXT:    lsl w9, w9, w1
-; CHECK-NEXT:    bic w0, w8, w9
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    ldr w9, [x0]
+; CHECK-NEXT:    lsl w8, w8, w1
+; CHECK-NEXT:    bic w0, w9, w8
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %conv = zext i8 %numlowbits to i32
@@ -298,10 +298,10 @@ define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 define i64 @bzhi64_b2_load(ptr %w, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi64_b2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    lsl x9, x9, x1
-; CHECK-NEXT:    bic x0, x8, x9
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ldr x9, [x0]
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    bic x0, x9, x8
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %notmask = shl i64 -1, %numlowbits
@@ -313,11 +313,11 @@ define i64 @bzhi64_b2_load(ptr %w, i64 %numlowbits) nounwind {
 define i64 @bzhi64_b3_load_indexzext(ptr %w, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi64_b3_load_indexzext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    lsl x9, x9, x1
-; CHECK-NEXT:    bic x0, x8, x9
+; CHECK-NEXT:    ldr x9, [x0]
+; CHECK-NEXT:    lsl x8, x8, x1
+; CHECK-NEXT:    bic x0, x9, x8
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %conv = zext i8 %numlowbits to i64
@@ -347,9 +347,9 @@ define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind {
 define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi32_c0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    lsr w8, w8, w9
 ; CHECK-NEXT:    and w0, w8, w0
 ; CHECK-NEXT:    ret
   %numhighbits = sub i32 32, %numlowbits
@@ -377,11 +377,11 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 define i32 @bzhi32_c2_load(ptr %w, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi32_c2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w10, #-1 // =0xffffffff
-; CHECK-NEXT:    lsr w8, w10, w8
-; CHECK-NEXT:    and w0, w8, w9
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    ldr w10, [x0]
+; CHECK-NEXT:    lsr w8, w8, w9
+; CHECK-NEXT:    and w0, w8, w10
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %numhighbits = sub i32 32, %numlowbits
@@ -394,11 +394,11 @@ define i32 @bzhi32_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi32_c3_load_indexzext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEXT:    ldr w9, [x0]
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
+; CHECK-NEXT:    ldr w10, [x0]
 ; CHECK-NEXT:    sub w8, w8, w1
-; CHECK-NEXT:    mov w10, #-1 // =0xffffffff
-; CHECK-NEXT:    lsr w8, w10, w8
-; CHECK-NEXT:    and w0, w8, w9
+; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    and w0, w8, w10
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %numhighbits = sub i8 32, %numlowbits
@@ -411,9 +411,9 @@ define i32 @bzhi32_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi32_c4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    lsr w8, w8, w9
 ; CHECK-NEXT:    and w0, w0, w8
 ; CHECK-NEXT:    ret
   %numhighbits = sub i32 32, %numlowbits
@@ -427,9 +427,9 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi64_c0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    lsr x8, x9, x8
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    lsr x8, x8, x9
 ; CHECK-NEXT:    and x0, x8, x0
 ; CHECK-NEXT:    ret
   %numhighbits = sub i64 64, %numlowbits
@@ -457,11 +457,11 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi64_c2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov x10, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    lsr x8, x10, x8
-; CHECK-NEXT:    and x0, x8, x9
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    ldr x10, [x0]
+; CHECK-NEXT:    lsr x8, x8, x9
+; CHECK-NEXT:    and x0, x8, x10
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %numhighbits = sub i64 64, %numlowbits
@@ -474,11 +474,11 @@ define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi64_c3_load_indexzext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #64 // =0x40
-; CHECK-NEXT:    ldr x9, [x0]
+; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ldr x10, [x0]
 ; CHECK-NEXT:    sub w8, w8, w1
-; CHECK-NEXT:    mov x10, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    lsr x8, x10, x8
-; CHECK-NEXT:    and x0, x8, x9
+; CHECK-NEXT:    lsr x8, x9, x8
+; CHECK-NEXT:    and x0, x8, x10
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %numhighbits = sub i8 64, %numlowbits
@@ -491,9 +491,9 @@ define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
 define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi64_c4_commutative:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    mov x9, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    lsr x8, x9, x8
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    lsr x8, x8, x9
 ; CHECK-NEXT:    and x0, x0, x8
 ; CHECK-NEXT:    ret
   %numhighbits = sub i64 64, %numlowbits
@@ -537,10 +537,10 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 define i32 @bzhi32_d2_load(ptr %w, i32 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi32_d2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    lsl w9, w9, w8
-; CHECK-NEXT:    lsr w0, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    lsl w8, w8, w9
+; CHECK-NEXT:    lsr w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %w
   %numhighbits = sub i32 32, %numlowbits
@@ -599,10 +599,10 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind {
 ; CHECK-LABEL: bzhi64_d2_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    lsl x9, x9, x8
-; CHECK-NEXT:    lsr x0, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    lsl x8, x8, x9
+; CHECK-NEXT:    lsr x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %w
   %numhighbits = sub i64 64, %numlowbits

diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index f7491f21471c6d..0be41f246512ff 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -480,8 +480,8 @@ define i1 @test_fcmp_ord(half %a, half %b) #0 {
 ; CHECK-CVT-NEXT: fmov  s2, #5.00000000
 ; CHECK-CVT-NEXT: fcmp  s1, s2
 ; CHECK-CVT-NEXT: fmov  s2, #8.00000000
-; CHECK-CVT-NEXT: adrp x8
 ; CHECK-CVT-NEXT: fccmp s1, s2, #4, mi
+; CHECK-CVT-NEXT: adrp x8
 ; CHECK-CVT-NEXT: ldr h1, [x8,
 ; CHECK-CVT-NEXT: fcsel s0, s0, s1, gt
 ; CHECK-CVT-NEXT: str   h0, [x0]

diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index 8ffa20dbcecf33..dde188f702042e 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -164,40 +164,40 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fabs s3, s3
+; CHECK-SD-NOFP16-NEXT:    fabs s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    fabs s5, s1
-; CHECK-SD-NOFP16-NEXT:    fabs s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    fabs s2, s3
 ; CHECK-SD-NOFP16-NEXT:    fabs s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fabs s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fabs s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fabs s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fabs s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fabs s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    fabs s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -227,18 +227,18 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    fabs v1.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    fabs v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
@@ -291,16 +291,16 @@ define <4 x half> @fabs_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fabs s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fabs s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
 ; CHECK-SD-NOFP16-NEXT:    fabs s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    fabs s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fabs s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    fabs s2, s3
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
@@ -332,40 +332,40 @@ define <8 x half> @fabs_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: fabs_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fabs s3, s3
+; CHECK-SD-NOFP16-NEXT:    fabs s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    fabs s5, s1
-; CHECK-SD-NOFP16-NEXT:    fabs s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    fabs s2, s3
 ; CHECK-SD-NOFP16-NEXT:    fabs s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fabs s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fabs s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fabs s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fabs s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fabs s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    fabs s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -401,78 +401,78 @@ define <16 x half> @fabs_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fabs s5, s5
+; CHECK-SD-NOFP16-NEXT:    fabs s4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    fabs s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fabs s16, s2
-; CHECK-SD-NOFP16-NEXT:    fabs s17, s3
-; CHECK-SD-NOFP16-NEXT:    fabs s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    fabs s18, s2
+; CHECK-SD-NOFP16-NEXT:    fabs s19, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
+; CHECK-SD-NOFP16-NEXT:    fabs s4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    fabs s7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
+; CHECK-SD-NOFP16-NEXT:    fabs s6, s7
+; CHECK-SD-NOFP16-NEXT:    fabs s16, s16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fabs s17, s17
+; CHECK-SD-NOFP16-NEXT:    fabs s5, s17
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    fabs s18, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fabs s16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fabs s5, s5
-; CHECK-SD-NOFP16-NEXT:    fabs s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    fabs s4, s4
+; CHECK-SD-NOFP16-NEXT:    fabs s4, s19
 ; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fabs s6, s6
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT:    fabs s7, s7
+; CHECK-SD-NOFP16-NEXT:    fabs s16, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fabs s5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fabs s4, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
+; CHECK-SD-NOFP16-NEXT:    fabs s7, s17
 ; CHECK-SD-NOFP16-NEXT:    fabs s0, s0
-; CHECK-SD-NOFP16-NEXT:    fabs s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fabs s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b

diff --git a/llvm/test/CodeGen/AArch64/fadd-combines.ll b/llvm/test/CodeGen/AArch64/fadd-combines.ll
index b9729ce534f660..f7bf92888cd374 100644
--- a/llvm/test/CodeGen/AArch64/fadd-combines.ll
+++ b/llvm/test/CodeGen/AArch64/fadd-combines.ll
@@ -28,9 +28,9 @@ define double @test2(double %a, double %b) {
 define double @test3(double %a, double %b, double %c) {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fadd d2, d2, d2
 ; CHECK-NEXT:    fmul d0, d0, d1
-; CHECK-NEXT:    fsub d0, d0, d2
+; CHECK-NEXT:    fadd d1, d2, d2
+; CHECK-NEXT:    fsub d0, d0, d1
 ; CHECK-NEXT:    ret
   %mul = fmul double %a, %b
   %mul1 = fmul double %c, 2.000000e+00
@@ -41,9 +41,9 @@ define double @test3(double %a, double %b, double %c) {
 define double @test4(double %a, double %b, double %c) {
 ; CHECK-LABEL: test4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fadd d2, d2, d2
 ; CHECK-NEXT:    fmul d0, d0, d1
-; CHECK-NEXT:    fsub d0, d0, d2
+; CHECK-NEXT:    fadd d1, d2, d2
+; CHECK-NEXT:    fsub d0, d0, d1
 ; CHECK-NEXT:    ret
   %mul = fmul double %a, %b
   %mul1 = fmul double %c, -2.000000e+00
@@ -132,8 +132,8 @@ define double @test7(double %a, double %b) nounwind {
 define float @fadd_const_multiuse_fmf(float %x) {
 ; CHECK-LABEL: fadd_const_multiuse_fmf:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1109917696
-; CHECK-NEXT:    mov w9, #1114374144
+; CHECK-NEXT:    mov w8, #1109917696 // =0x42280000
+; CHECK-NEXT:    mov w9, #1114374144 // =0x426c0000
 ; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    fadd s1, s0, s1
@@ -150,8 +150,8 @@ define float @fadd_const_multiuse_fmf(float %x) {
 define float @fadd_const_multiuse_attr(float %x) {
 ; CHECK-LABEL: fadd_const_multiuse_attr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1109917696
-; CHECK-NEXT:    mov w9, #1114374144
+; CHECK-NEXT:    mov w8, #1109917696 // =0x42280000
+; CHECK-NEXT:    mov w9, #1114374144 // =0x426c0000
 ; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    fadd s1, s0, s1

diff --git a/llvm/test/CodeGen/AArch64/faddp-half.ll b/llvm/test/CodeGen/AArch64/faddp-half.ll
index 0c8676da6862dd..6a450881dc978b 100644
--- a/llvm/test/CodeGen/AArch64/faddp-half.ll
+++ b/llvm/test/CodeGen/AArch64/faddp-half.ll
@@ -152,51 +152,51 @@ define <8 x half> @addp_v8f16(<8 x half> %a) {
 ; CHECKNOFP16-NEXT:    rev32 v2.8h, v0.8h
 ; CHECKNOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECKNOFP16-NEXT:    fcvt s4, h0
-; CHECKNOFP16-NEXT:    mov h6, v0.h[2]
+; CHECKNOFP16-NEXT:    mov h5, v0.h[2]
 ; CHECKNOFP16-NEXT:    mov h16, v0.h[3]
 ; CHECKNOFP16-NEXT:    mov h3, v2.h[1]
+; CHECKNOFP16-NEXT:    fcvt s6, h2
 ; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fcvt s5, h2
 ; CHECKNOFP16-NEXT:    mov h7, v2.h[2]
-; CHECKNOFP16-NEXT:    mov h17, v2.h[3]
+; CHECKNOFP16-NEXT:    fcvt s5, h5
+; CHECKNOFP16-NEXT:    fcvt s16, h16
 ; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fadd s4, s5, s4
-; CHECKNOFP16-NEXT:    fcvt s5, h6
-; CHECKNOFP16-NEXT:    fcvt s6, h7
-; CHECKNOFP16-NEXT:    fcvt s7, h16
-; CHECKNOFP16-NEXT:    fcvt s16, h17
+; CHECKNOFP16-NEXT:    fadd s4, s6, s4
+; CHECKNOFP16-NEXT:    mov h6, v2.h[3]
+; CHECKNOFP16-NEXT:    fcvt s7, h7
 ; CHECKNOFP16-NEXT:    fadd s3, s3, s1
+; CHECKNOFP16-NEXT:    fcvt s6, h6
 ; CHECKNOFP16-NEXT:    fcvt h1, s4
-; CHECKNOFP16-NEXT:    fadd s4, s6, s5
+; CHECKNOFP16-NEXT:    fadd s4, s7, s5
 ; CHECKNOFP16-NEXT:    mov h5, v0.h[4]
-; CHECKNOFP16-NEXT:    mov h6, v2.h[4]
-; CHECKNOFP16-NEXT:    fadd s7, s16, s7
+; CHECKNOFP16-NEXT:    mov h7, v2.h[4]
 ; CHECKNOFP16-NEXT:    fcvt h3, s3
+; CHECKNOFP16-NEXT:    fadd s6, s6, s16
 ; CHECKNOFP16-NEXT:    mov h16, v2.h[5]
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    fcvt s6, h6
-; CHECKNOFP16-NEXT:    fcvt h7, s7
+; CHECKNOFP16-NEXT:    fcvt h4, s4
 ; CHECKNOFP16-NEXT:    mov v1.h[1], v3.h[0]
-; CHECKNOFP16-NEXT:    fcvt h3, s4
-; CHECKNOFP16-NEXT:    mov h4, v0.h[5]
-; CHECKNOFP16-NEXT:    fadd s5, s6, s5
-; CHECKNOFP16-NEXT:    mov h6, v0.h[6]
-; CHECKNOFP16-NEXT:    mov v1.h[2], v3.h[0]
+; CHECKNOFP16-NEXT:    fcvt s3, h5
+; CHECKNOFP16-NEXT:    fcvt s5, h7
+; CHECKNOFP16-NEXT:    mov h7, v0.h[5]
+; CHECKNOFP16-NEXT:    fcvt h6, s6
+; CHECKNOFP16-NEXT:    fcvt s16, h16
+; CHECKNOFP16-NEXT:    mov v1.h[2], v4.h[0]
+; CHECKNOFP16-NEXT:    mov h4, v0.h[6]
+; CHECKNOFP16-NEXT:    fadd s3, s5, s3
+; CHECKNOFP16-NEXT:    mov h5, v2.h[6]
+; CHECKNOFP16-NEXT:    fcvt s7, h7
 ; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
-; CHECKNOFP16-NEXT:    fcvt s3, h4
-; CHECKNOFP16-NEXT:    fcvt s4, h16
-; CHECKNOFP16-NEXT:    mov h16, v2.h[6]
 ; CHECKNOFP16-NEXT:    mov h2, v2.h[7]
-; CHECKNOFP16-NEXT:    mov v1.h[3], v7.h[0]
+; CHECKNOFP16-NEXT:    mov v1.h[3], v6.h[0]
+; CHECKNOFP16-NEXT:    fcvt s4, h4
+; CHECKNOFP16-NEXT:    fcvt h3, s3
+; CHECKNOFP16-NEXT:    fcvt s5, h5
+; CHECKNOFP16-NEXT:    fadd s6, s16, s7
 ; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fadd s3, s4, s3
-; CHECKNOFP16-NEXT:    fcvt h4, s5
-; CHECKNOFP16-NEXT:    fcvt s5, h6
-; CHECKNOFP16-NEXT:    fcvt s6, h16
 ; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECKNOFP16-NEXT:    fcvt h3, s3
-; CHECKNOFP16-NEXT:    fadd s4, s6, s5
+; CHECKNOFP16-NEXT:    mov v1.h[4], v3.h[0]
+; CHECKNOFP16-NEXT:    fadd s4, s5, s4
+; CHECKNOFP16-NEXT:    fcvt h3, s6
 ; CHECKNOFP16-NEXT:    fadd s0, s2, s0
 ; CHECKNOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECKNOFP16-NEXT:    fcvt h3, s4
@@ -221,112 +221,112 @@ define <16 x half> @addp_v16f16(<16 x half> %a) {
 ;
 ; CHECKNOFP16-LABEL: addp_v16f16:
 ; CHECKNOFP16:       // %bb.0: // %entry
-; CHECKNOFP16-NEXT:    rev32 v4.8h, v0.8h
+; CHECKNOFP16-NEXT:    rev32 v5.8h, v0.8h
+; CHECKNOFP16-NEXT:    rev32 v4.8h, v1.8h
 ; CHECKNOFP16-NEXT:    mov h2, v0.h[1]
-; CHECKNOFP16-NEXT:    fcvt s6, h0
-; CHECKNOFP16-NEXT:    rev32 v5.8h, v1.8h
+; CHECKNOFP16-NEXT:    mov h6, v1.h[1]
+; CHECKNOFP16-NEXT:    fcvt s16, h0
 ; CHECKNOFP16-NEXT:    mov h17, v0.h[2]
-; CHECKNOFP16-NEXT:    mov h18, v0.h[3]
-; CHECKNOFP16-NEXT:    mov h3, v4.h[1]
+; CHECKNOFP16-NEXT:    fcvt s20, h1
+; CHECKNOFP16-NEXT:    mov h21, v1.h[2]
+; CHECKNOFP16-NEXT:    mov h3, v5.h[1]
+; CHECKNOFP16-NEXT:    mov h7, v4.h[1]
 ; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fcvt s7, h4
-; CHECKNOFP16-NEXT:    mov h20, v4.h[2]
-; CHECKNOFP16-NEXT:    mov h16, v5.h[1]
-; CHECKNOFP16-NEXT:    fcvt s19, h5
-; CHECKNOFP16-NEXT:    mov h21, v4.h[3]
-; CHECKNOFP16-NEXT:    mov h22, v0.h[4]
+; CHECKNOFP16-NEXT:    fcvt s18, h5
+; CHECKNOFP16-NEXT:    mov h19, v5.h[2]
+; CHECKNOFP16-NEXT:    fcvt s6, h6
+; CHECKNOFP16-NEXT:    fcvt s22, h4
+; CHECKNOFP16-NEXT:    mov h23, v4.h[2]
+; CHECKNOFP16-NEXT:    fcvt s17, h17
+; CHECKNOFP16-NEXT:    mov h24, v5.h[3]
+; CHECKNOFP16-NEXT:    fcvt s21, h21
+; CHECKNOFP16-NEXT:    mov h25, v4.h[6]
 ; CHECKNOFP16-NEXT:    fcvt s3, h3
-; CHECKNOFP16-NEXT:    fadd s6, s7, s6
-; CHECKNOFP16-NEXT:    mov h7, v1.h[1]
-; CHECKNOFP16-NEXT:    fcvt s16, h16
-; CHECKNOFP16-NEXT:    fadd s3, s3, s2
-; CHECKNOFP16-NEXT:    fcvt h2, s6
-; CHECKNOFP16-NEXT:    fcvt s6, h1
 ; CHECKNOFP16-NEXT:    fcvt s7, h7
-; CHECKNOFP16-NEXT:    fcvt h3, s3
-; CHECKNOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECKNOFP16-NEXT:    fadd s3, s19, s6
-; CHECKNOFP16-NEXT:    fadd s6, s16, s7
-; CHECKNOFP16-NEXT:    fcvt s7, h17
-; CHECKNOFP16-NEXT:    fcvt s16, h20
-; CHECKNOFP16-NEXT:    fcvt s17, h18
-; CHECKNOFP16-NEXT:    fcvt s18, h21
-; CHECKNOFP16-NEXT:    mov h19, v1.h[2]
-; CHECKNOFP16-NEXT:    mov h20, v5.h[2]
-; CHECKNOFP16-NEXT:    fcvt h3, s3
+; CHECKNOFP16-NEXT:    fadd s16, s18, s16
+; CHECKNOFP16-NEXT:    fcvt s18, h19
+; CHECKNOFP16-NEXT:    mov h19, v0.h[3]
+; CHECKNOFP16-NEXT:    fadd s20, s22, s20
+; CHECKNOFP16-NEXT:    fcvt s22, h23
+; CHECKNOFP16-NEXT:    mov h23, v4.h[3]
+; CHECKNOFP16-NEXT:    fadd s3, s3, s2
+; CHECKNOFP16-NEXT:    fadd s6, s7, s6
+; CHECKNOFP16-NEXT:    mov h7, v1.h[3]
+; CHECKNOFP16-NEXT:    fcvt h2, s16
+; CHECKNOFP16-NEXT:    fadd s16, s18, s17
+; CHECKNOFP16-NEXT:    fcvt s18, h19
+; CHECKNOFP16-NEXT:    fcvt s19, h24
+; CHECKNOFP16-NEXT:    mov h24, v5.h[6]
+; CHECKNOFP16-NEXT:    fcvt h17, s3
+; CHECKNOFP16-NEXT:    fcvt h3, s20
+; CHECKNOFP16-NEXT:    fadd s20, s22, s21
 ; CHECKNOFP16-NEXT:    fcvt h6, s6
-; CHECKNOFP16-NEXT:    fadd s7, s16, s7
-; CHECKNOFP16-NEXT:    mov h16, v1.h[3]
-; CHECKNOFP16-NEXT:    fadd s17, s18, s17
-; CHECKNOFP16-NEXT:    mov h18, v4.h[4]
-; CHECKNOFP16-NEXT:    fcvt s19, h19
-; CHECKNOFP16-NEXT:    fcvt s20, h20
+; CHECKNOFP16-NEXT:    fcvt s7, h7
+; CHECKNOFP16-NEXT:    fcvt s22, h23
+; CHECKNOFP16-NEXT:    mov h21, v0.h[4]
+; CHECKNOFP16-NEXT:    mov h23, v5.h[4]
+; CHECKNOFP16-NEXT:    fcvt h16, s16
+; CHECKNOFP16-NEXT:    fadd s18, s19, s18
+; CHECKNOFP16-NEXT:    mov h19, v4.h[4]
+; CHECKNOFP16-NEXT:    mov v2.h[1], v17.h[0]
+; CHECKNOFP16-NEXT:    mov h17, v1.h[4]
 ; CHECKNOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECKNOFP16-NEXT:    mov h6, v5.h[3]
-; CHECKNOFP16-NEXT:    fcvt h7, s7
+; CHECKNOFP16-NEXT:    fcvt h6, s20
+; CHECKNOFP16-NEXT:    fadd s7, s22, s7
+; CHECKNOFP16-NEXT:    fcvt s20, h21
+; CHECKNOFP16-NEXT:    mov h21, v0.h[5]
+; CHECKNOFP16-NEXT:    mov h22, v5.h[5]
+; CHECKNOFP16-NEXT:    fcvt h18, s18
+; CHECKNOFP16-NEXT:    fcvt s19, h19
+; CHECKNOFP16-NEXT:    mov h5, v5.h[7]
+; CHECKNOFP16-NEXT:    mov v2.h[2], v16.h[0]
+; CHECKNOFP16-NEXT:    fcvt s16, h23
+; CHECKNOFP16-NEXT:    fcvt s17, h17
+; CHECKNOFP16-NEXT:    mov v3.h[2], v6.h[0]
+; CHECKNOFP16-NEXT:    fcvt h6, s7
+; CHECKNOFP16-NEXT:    mov h7, v1.h[5]
+; CHECKNOFP16-NEXT:    mov h23, v4.h[5]
+; CHECKNOFP16-NEXT:    mov h4, v4.h[7]
+; CHECKNOFP16-NEXT:    fcvt s5, h5
+; CHECKNOFP16-NEXT:    fadd s16, s16, s20
+; CHECKNOFP16-NEXT:    mov h20, v0.h[6]
+; CHECKNOFP16-NEXT:    fadd s17, s19, s17
+; CHECKNOFP16-NEXT:    mov h19, v1.h[6]
+; CHECKNOFP16-NEXT:    mov v2.h[3], v18.h[0]
+; CHECKNOFP16-NEXT:    fcvt s18, h21
 ; CHECKNOFP16-NEXT:    fcvt s21, h22
-; CHECKNOFP16-NEXT:    fcvt s18, h18
-; CHECKNOFP16-NEXT:    fadd s19, s20, s19
-; CHECKNOFP16-NEXT:    fcvt s16, h16
-; CHECKNOFP16-NEXT:    fcvt s6, h6
-; CHECKNOFP16-NEXT:    fcvt h17, s17
-; CHECKNOFP16-NEXT:    mov v2.h[2], v7.h[0]
-; CHECKNOFP16-NEXT:    mov h20, v5.h[4]
-; CHECKNOFP16-NEXT:    fadd s7, s18, s21
-; CHECKNOFP16-NEXT:    mov h18, v1.h[4]
-; CHECKNOFP16-NEXT:    fadd s6, s6, s16
-; CHECKNOFP16-NEXT:    fcvt h16, s19
-; CHECKNOFP16-NEXT:    mov v2.h[3], v17.h[0]
-; CHECKNOFP16-NEXT:    mov h19, v5.h[5]
-; CHECKNOFP16-NEXT:    fcvt h7, s7
-; CHECKNOFP16-NEXT:    fcvt s17, h18
-; CHECKNOFP16-NEXT:    fcvt s18, h20
-; CHECKNOFP16-NEXT:    fcvt h6, s6
-; CHECKNOFP16-NEXT:    mov v3.h[2], v16.h[0]
-; CHECKNOFP16-NEXT:    mov h16, v0.h[5]
-; CHECKNOFP16-NEXT:    mov v2.h[4], v7.h[0]
-; CHECKNOFP16-NEXT:    fadd s7, s18, s17
-; CHECKNOFP16-NEXT:    mov h17, v4.h[5]
-; CHECKNOFP16-NEXT:    mov h18, v1.h[5]
 ; CHECKNOFP16-NEXT:    mov v3.h[3], v6.h[0]
-; CHECKNOFP16-NEXT:    fcvt h6, s7
-; CHECKNOFP16-NEXT:    fcvt s7, h16
-; CHECKNOFP16-NEXT:    fcvt s16, h17
-; CHECKNOFP16-NEXT:    fcvt s17, h18
-; CHECKNOFP16-NEXT:    fcvt s18, h19
-; CHECKNOFP16-NEXT:    mov h19, v0.h[6]
-; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
-; CHECKNOFP16-NEXT:    mov v3.h[4], v6.h[0]
-; CHECKNOFP16-NEXT:    mov h6, v4.h[6]
-; CHECKNOFP16-NEXT:    fadd s7, s16, s7
-; CHECKNOFP16-NEXT:    fadd s16, s18, s17
-; CHECKNOFP16-NEXT:    mov h17, v1.h[6]
-; CHECKNOFP16-NEXT:    mov h18, v5.h[6]
+; CHECKNOFP16-NEXT:    fcvt s6, h7
+; CHECKNOFP16-NEXT:    fcvt s7, h23
+; CHECKNOFP16-NEXT:    fcvt s22, h24
+; CHECKNOFP16-NEXT:    fcvt s23, h25
+; CHECKNOFP16-NEXT:    fcvt h16, s16
+; CHECKNOFP16-NEXT:    fcvt s20, h20
+; CHECKNOFP16-NEXT:    fcvt h17, s17
 ; CHECKNOFP16-NEXT:    fcvt s19, h19
-; CHECKNOFP16-NEXT:    fcvt s6, h6
-; CHECKNOFP16-NEXT:    mov h4, v4.h[7]
+; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
 ; CHECKNOFP16-NEXT:    mov h1, v1.h[7]
-; CHECKNOFP16-NEXT:    mov h5, v5.h[7]
-; CHECKNOFP16-NEXT:    fcvt s17, h17
-; CHECKNOFP16-NEXT:    fcvt h7, s7
-; CHECKNOFP16-NEXT:    fcvt s18, h18
-; CHECKNOFP16-NEXT:    fcvt s0, h0
-; CHECKNOFP16-NEXT:    fadd s6, s6, s19
+; CHECKNOFP16-NEXT:    fadd s18, s21, s18
 ; CHECKNOFP16-NEXT:    fcvt s4, h4
+; CHECKNOFP16-NEXT:    fadd s6, s7, s6
+; CHECKNOFP16-NEXT:    mov v2.h[4], v16.h[0]
+; CHECKNOFP16-NEXT:    fadd s7, s22, s20
+; CHECKNOFP16-NEXT:    mov v3.h[4], v17.h[0]
+; CHECKNOFP16-NEXT:    fadd s16, s23, s19
+; CHECKNOFP16-NEXT:    fcvt s0, h0
 ; CHECKNOFP16-NEXT:    fcvt s1, h1
-; CHECKNOFP16-NEXT:    fcvt s5, h5
-; CHECKNOFP16-NEXT:    fcvt h16, s16
-; CHECKNOFP16-NEXT:    fadd s17, s18, s17
-; CHECKNOFP16-NEXT:    mov v2.h[5], v7.h[0]
+; CHECKNOFP16-NEXT:    fcvt h17, s18
 ; CHECKNOFP16-NEXT:    fcvt h6, s6
-; CHECKNOFP16-NEXT:    fadd s0, s4, s0
-; CHECKNOFP16-NEXT:    fadd s1, s5, s1
-; CHECKNOFP16-NEXT:    mov v3.h[5], v16.h[0]
-; CHECKNOFP16-NEXT:    fcvt h4, s17
-; CHECKNOFP16-NEXT:    mov v2.h[6], v6.h[0]
+; CHECKNOFP16-NEXT:    fadd s0, s5, s0
+; CHECKNOFP16-NEXT:    fcvt h5, s7
+; CHECKNOFP16-NEXT:    fadd s1, s4, s1
+; CHECKNOFP16-NEXT:    mov v2.h[5], v17.h[0]
+; CHECKNOFP16-NEXT:    mov v3.h[5], v6.h[0]
+; CHECKNOFP16-NEXT:    fcvt h6, s16
 ; CHECKNOFP16-NEXT:    fcvt h0, s0
 ; CHECKNOFP16-NEXT:    fcvt h1, s1
-; CHECKNOFP16-NEXT:    mov v3.h[6], v4.h[0]
+; CHECKNOFP16-NEXT:    mov v2.h[6], v5.h[0]
+; CHECKNOFP16-NEXT:    mov v3.h[6], v6.h[0]
 ; CHECKNOFP16-NEXT:    mov v2.h[7], v0.h[0]
 ; CHECKNOFP16-NEXT:    mov v3.h[7], v1.h[0]
 ; CHECKNOFP16-NEXT:    mov v0.16b, v2.16b

diff --git a/llvm/test/CodeGen/AArch64/faddp.ll b/llvm/test/CodeGen/AArch64/faddp.ll
index d933d7fb7554de..d0c8f6e26d4d6e 100644
--- a/llvm/test/CodeGen/AArch64/faddp.ll
+++ b/llvm/test/CodeGen/AArch64/faddp.ll
@@ -216,10 +216,10 @@ entry:
 define <8 x float> @addp_v8f32(<8 x float> %a) {
 ; CHECK-LABEL: addp_v8f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rev64 v2.4s, v0.4s
-; CHECK-NEXT:    rev64 v3.4s, v1.4s
-; CHECK-NEXT:    fadd v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    fadd v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    rev64 v2.4s, v1.4s
+; CHECK-NEXT:    rev64 v3.4s, v0.4s
+; CHECK-NEXT:    fadd v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    fadd v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    ret
 entry:
   %s = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>

diff --git a/llvm/test/CodeGen/AArch64/fast-isel-addressing-modes.ll b/llvm/test/CodeGen/AArch64/fast-isel-addressing-modes.ll
index 422d2c7a823421..5e14e63a23d3ca 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-addressing-modes.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-addressing-modes.ll
@@ -97,13 +97,13 @@ define void @store_breg_i1(ptr %a) {
 define void @store_breg_i1_2(ptr %a) {
 ; SDAG-LABEL: store_breg_i1_2:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    mov w8, #1
+; SDAG-NEXT:    mov w8, #1 ; =0x1
 ; SDAG-NEXT:    strb w8, [x0]
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: store_breg_i1_2:
 ; FAST:       ; %bb.0:
-; FAST-NEXT:    mov w8, #1
+; FAST-NEXT:    mov w8, #1 ; =0x1
 ; FAST-NEXT:    and w8, w8, #0x1
 ; FAST-NEXT:    strb w8, [x0]
 ; FAST-NEXT:    ret
@@ -169,13 +169,13 @@ define void @store_breg_f64(ptr %a) {
 define i32 @load_immoff_1() {
 ; SDAG-LABEL: load_immoff_1:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    mov w8, #128
+; SDAG-NEXT:    mov w8, #128 ; =0x80
 ; SDAG-NEXT:    ldr w0, [x8]
 ; SDAG-NEXT:    ret
 ;
 ; FAST-LABEL: load_immoff_1:
 ; FAST:       ; %bb.0:
-; FAST-NEXT:    mov x8, #128
+; FAST-NEXT:    mov x8, #128 ; =0x80
 ; FAST-NEXT:    ldr w0, [x8]
 ; FAST-NEXT:    ret
   %1 = inttoptr i64 128 to ptr
@@ -250,7 +250,7 @@ define i32 @load_breg_immoff_5(i64 %a) {
 define i32 @load_breg_immoff_6(i64 %a) {
 ; SDAG-LABEL: load_breg_immoff_6:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    mov w8, #16384
+; SDAG-NEXT:    mov w8, #16384 ; =0x4000
 ; SDAG-NEXT:    ldr w0, [x0, x8]
 ; SDAG-NEXT:    ret
 ;
@@ -331,7 +331,7 @@ define void @store_breg_immoff_5(i64 %a) {
 define void @store_breg_immoff_6(i64 %a) {
 ; SDAG-LABEL: store_breg_immoff_6:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    mov w8, #16384
+; SDAG-NEXT:    mov w8, #16384 ; =0x4000
 ; SDAG-NEXT:    str wzr, [x0, x8]
 ; SDAG-NEXT:    ret
 ;
@@ -410,7 +410,7 @@ define i64 @load_breg_offreg_immoff_1(i64 %a, i64 %b) {
 define i64 @load_breg_offreg_immoff_2(i64 %a, i64 %b) {
 ; SDAG-LABEL: load_breg_offreg_immoff_2:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    mov w8, #61440
+; SDAG-NEXT:    mov w8, #61440 ; =0xf000
 ; SDAG-NEXT:    add x9, x0, x1
 ; SDAG-NEXT:    ldr x0, [x9, x8]
 ; SDAG-NEXT:    ret
@@ -772,10 +772,10 @@ define i64 @kill_reg(i64 %a) {
 ;
 ; FAST-LABEL: kill_reg:
 ; FAST:       ; %bb.0:
-; FAST-NEXT:    ldr x8, [x0, #88]
-; FAST-NEXT:    sub x9, x0, #8
-; FAST-NEXT:    add x9, x9, #96
-; FAST-NEXT:    add x0, x9, x8
+; FAST-NEXT:    sub x8, x0, #8
+; FAST-NEXT:    ldr x9, [x0, #88]
+; FAST-NEXT:    add x8, x8, #96
+; FAST-NEXT:    add x0, x8, x9
 ; FAST-NEXT:    ret
   %1 = sub i64 %a, 8
   %2 = add i64 %1, 96
@@ -786,25 +786,15 @@ define i64 @kill_reg(i64 %a) {
 }
 
 define void @store_fi(i64 %i) {
-; SDAG-LABEL: store_fi:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    sub sp, sp, #32
-; SDAG-NEXT:    .cfi_def_cfa_offset 32
-; SDAG-NEXT:    mov x8, sp
-; SDAG-NEXT:    mov w9, #47
-; SDAG-NEXT:    str w9, [x8, x0, lsl #2]
-; SDAG-NEXT:    add sp, sp, #32
-; SDAG-NEXT:    ret
-;
-; FAST-LABEL: store_fi:
-; FAST:       ; %bb.0:
-; FAST-NEXT:    sub sp, sp, #32
-; FAST-NEXT:    .cfi_def_cfa_offset 32
-; FAST-NEXT:    mov w8, #47
-; FAST-NEXT:    mov x9, sp
-; FAST-NEXT:    str w8, [x9, x0, lsl #2]
-; FAST-NEXT:    add sp, sp, #32
-; FAST-NEXT:    ret
+; CHECK-LABEL: store_fi:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov w9, #47 ; =0x2f
+; CHECK-NEXT:    str w9, [x8, x0, lsl #2]
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
   %1 = alloca [8 x i32]
   %2 = ptrtoint ptr %1 to i64
   %3 = mul i64 %i, 4

diff --git a/llvm/test/CodeGen/AArch64/fast-isel-gep.ll b/llvm/test/CodeGen/AArch64/fast-isel-gep.ll
index 9df826a2482e70..3dc4771eb01c15 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-gep.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-gep.ll
@@ -15,7 +15,7 @@ define ptr @test_struct(ptr %f) {
 define ptr @test_array1(ptr %a, i64 %i) {
 ; CHECK-LABEL: test_array1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov x8, #4
+; CHECK-NEXT:    mov x8, #4 ; =0x4
 ; CHECK-NEXT:    madd x0, x1, x8, x0
 ; CHECK-NEXT:    ret
   %1 = getelementptr inbounds i32, ptr %a, i64 %i
@@ -43,7 +43,7 @@ define ptr @test_array3(ptr %a) {
 define ptr @test_array4(ptr %a) {
 ; CHECK-LABEL: test_array4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov x8, #4104
+; CHECK-NEXT:    mov x8, #4104 ; =0x1008
 ; CHECK-NEXT:    add x0, x0, x8
 ; CHECK-NEXT:    ret
   %1 = getelementptr inbounds i32, ptr %a, i64 1026
@@ -54,9 +54,9 @@ define ptr @test_array5(ptr %a, i32 %i) {
 ; CHECK-LABEL: test_array5:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT:    mov x8, #4
-; CHECK-NEXT:    sxtw x9, w1
-; CHECK-NEXT:    madd x0, x9, x8, x0
+; CHECK-NEXT:    sxtw x8, w1
+; CHECK-NEXT:    mov x9, #4 ; =0x4
+; CHECK-NEXT:    madd x0, x8, x9, x0
 ; CHECK-NEXT:    ret
   %1 = getelementptr inbounds i32, ptr %a, i32 %i
   ret ptr %1

diff --git a/llvm/test/CodeGen/AArch64/fast-isel-memcpy.ll b/llvm/test/CodeGen/AArch64/fast-isel-memcpy.ll
index b7971af9906c23..76873cd3ca9e51 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-memcpy.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-memcpy.ll
@@ -5,9 +5,9 @@
 define void @test(i64 %a, ptr %b) {
 ; CHECK-LABEL: test:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x7fffffffffffffff
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    ldr x8, [x1]
+; CHECK-NEXT:    and x9, x0, #0x7fffffffffffffff
+; CHECK-NEXT:    str x8, [x9]
 ; CHECK-NEXT:    ret
   %1 = and i64 %a, 9223372036854775807
   %2 = inttoptr i64 %1 to ptr

diff --git a/llvm/test/CodeGen/AArch64/fast-isel-shift.ll b/llvm/test/CodeGen/AArch64/fast-isel-shift.ll
index f77b3af1e2bed6..95891db80bc4ed 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-shift.ll
@@ -391,9 +391,9 @@ define i64 @lsl_i64(i64 %a) {
 define zeroext i8 @lsrv_i8(i8 %a, i8 %b) {
 ; CHECK-LABEL: lsrv_i8:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and w8, w1, #0xff
-; CHECK-NEXT:    and w9, w0, #0xff
-; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    and w8, w0, #0xff
+; CHECK-NEXT:    and w9, w1, #0xff
+; CHECK-NEXT:    lsr w8, w8, w9
 ; CHECK-NEXT:    and w8, w8, #0xff
 ; CHECK-NEXT:    uxtb w0, w8
 ; CHECK-NEXT:    ret
@@ -458,9 +458,9 @@ define i32 @lsr_sext_i8_i32(i8 %b) {
 define zeroext i16 @lsrv_i16(i16 %a, i16 %b) {
 ; CHECK-LABEL: lsrv_i16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and w8, w1, #0xffff
-; CHECK-NEXT:    and w9, w0, #0xffff
-; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    and w9, w1, #0xffff
+; CHECK-NEXT:    lsr w8, w8, w9
 ; CHECK-NEXT:    and w8, w8, #0xffff
 ; CHECK-NEXT:    uxth w0, w8
 ; CHECK-NEXT:    ret
@@ -517,9 +517,9 @@ define i64 @lsr_i64(i64 %a) {
 define zeroext i8 @asrv_i8(i8 %a, i8 %b) {
 ; CHECK-LABEL: asrv_i8:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and w8, w1, #0xff
-; CHECK-NEXT:    sxtb w9, w0
-; CHECK-NEXT:    asr w8, w9, w8
+; CHECK-NEXT:    sxtb w8, w0
+; CHECK-NEXT:    and w9, w1, #0xff
+; CHECK-NEXT:    asr w8, w8, w9
 ; CHECK-NEXT:    and w8, w8, #0xff
 ; CHECK-NEXT:    uxtb w0, w8
 ; CHECK-NEXT:    ret
@@ -582,9 +582,9 @@ define i32 @asr_sext_i8_i32(i8 %b) {
 define zeroext i16 @asrv_i16(i16 %a, i16 %b) {
 ; CHECK-LABEL: asrv_i16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    and w8, w1, #0xffff
-; CHECK-NEXT:    sxth w9, w0
-; CHECK-NEXT:    asr w8, w9, w8
+; CHECK-NEXT:    sxth w8, w0
+; CHECK-NEXT:    and w9, w1, #0xffff
+; CHECK-NEXT:    asr w8, w8, w9
 ; CHECK-NEXT:    and w8, w8, #0xffff
 ; CHECK-NEXT:    uxth w0, w8
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 53188b01d34a10..b9713b57cef681 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -95,8 +95,8 @@ entry:
 define float @copysign32(float %a, float %b) {
 ; CHECK-LABEL: copysign32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-NEXT:    mvni v2.4s, #128, lsl #24
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
@@ -156,8 +156,8 @@ define half @copysign16(half %a, half %b) {
 ; CHECK-NONEON-NEXT:    fcvt s0, h0
 ; CHECK-NONEON-NEXT:    str h1, [sp, #12]
 ; CHECK-NONEON-NEXT:    ldrb w8, [sp, #13]
-; CHECK-NONEON-NEXT:    fabs s0, s0
 ; CHECK-NONEON-NEXT:    tst w8, #0x80
+; CHECK-NONEON-NEXT:    fabs s0, s0
 ; CHECK-NONEON-NEXT:    fneg s1, s0
 ; CHECK-NONEON-NEXT:    fcsel s0, s1, s0, ne
 ; CHECK-NONEON-NEXT:    fcvt h0, s0

diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 28817ecec5177a..3c5ad83b920df8 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -164,40 +164,40 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: ceil_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintp s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintp s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintp s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintp s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintp s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintp s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintp s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintp s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -227,18 +227,18 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    frintp v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
@@ -291,16 +291,16 @@ define <4 x half> @ceil_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    frintp s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
 ; CHECK-SD-NOFP16-NEXT:    frintp s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    frintp s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintp s2, s3
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
@@ -332,40 +332,40 @@ define <8 x half> @ceil_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: ceil_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintp s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintp s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintp s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintp s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintp s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintp s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintp s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintp s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintp s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -401,78 +401,78 @@ define <16 x half> @ceil_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintp s5, s5
+; CHECK-SD-NOFP16-NEXT:    frintp s4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    frintp s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    frintp s16, s2
-; CHECK-SD-NOFP16-NEXT:    frintp s17, s3
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    frintp s18, s2
+; CHECK-SD-NOFP16-NEXT:    frintp s19, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
+; CHECK-SD-NOFP16-NEXT:    frintp s4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintp s7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
+; CHECK-SD-NOFP16-NEXT:    frintp s6, s7
+; CHECK-SD-NOFP16-NEXT:    frintp s16, s16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    frintp s17, s17
+; CHECK-SD-NOFP16-NEXT:    frintp s5, s17
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    frintp s18, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintp s16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintp s5, s5
-; CHECK-SD-NOFP16-NEXT:    frintp s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s4
+; CHECK-SD-NOFP16-NEXT:    frintp s4, s19
 ; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintp s6, s6
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT:    frintp s7, s7
+; CHECK-SD-NOFP16-NEXT:    frintp s16, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintp s5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintp s4, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
+; CHECK-SD-NOFP16-NEXT:    frintp s7, s17
 ; CHECK-SD-NOFP16-NEXT:    frintp s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintp s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintp s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
@@ -671,40 +671,40 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: floor_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintm s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintm s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintm s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintm s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintm s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintm s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintm s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintm s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -734,18 +734,18 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    frintm v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
@@ -798,16 +798,16 @@ define <4 x half> @floor_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    frintm s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
 ; CHECK-SD-NOFP16-NEXT:    frintm s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    frintm s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintm s2, s3
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
@@ -839,40 +839,40 @@ define <8 x half> @floor_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: floor_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintm s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintm s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintm s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintm s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintm s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintm s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintm s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintm s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintm s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -908,78 +908,78 @@ define <16 x half> @floor_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintm s5, s5
+; CHECK-SD-NOFP16-NEXT:    frintm s4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    frintm s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    frintm s16, s2
-; CHECK-SD-NOFP16-NEXT:    frintm s17, s3
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    frintm s18, s2
+; CHECK-SD-NOFP16-NEXT:    frintm s19, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
+; CHECK-SD-NOFP16-NEXT:    frintm s4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintm s7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
+; CHECK-SD-NOFP16-NEXT:    frintm s6, s7
+; CHECK-SD-NOFP16-NEXT:    frintm s16, s16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    frintm s17, s17
+; CHECK-SD-NOFP16-NEXT:    frintm s5, s17
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    frintm s18, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintm s16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintm s5, s5
-; CHECK-SD-NOFP16-NEXT:    frintm s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s4
+; CHECK-SD-NOFP16-NEXT:    frintm s4, s19
 ; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintm s6, s6
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT:    frintm s7, s7
+; CHECK-SD-NOFP16-NEXT:    frintm s16, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintm s5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintm s4, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
+; CHECK-SD-NOFP16-NEXT:    frintm s7, s17
 ; CHECK-SD-NOFP16-NEXT:    frintm s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintm s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintm s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
@@ -1178,40 +1178,40 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: nearbyint_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frinti s3, s3
+; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frinti s5, s1
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frinti s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frinti s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frinti s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frinti s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frinti s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frinti s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -1241,18 +1241,18 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    frinti v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
@@ -1305,16 +1305,16 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    frinti s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
 ; CHECK-SD-NOFP16-NEXT:    frinti s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    frinti s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frinti s2, s3
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
@@ -1346,40 +1346,40 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: nearbyint_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frinti s3, s3
+; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frinti s5, s1
-; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frinti s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frinti s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frinti s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frinti s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frinti s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frinti s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frinti s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -1415,78 +1415,78 @@ define <16 x half> @nearbyint_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinti s5, s5
+; CHECK-SD-NOFP16-NEXT:    frinti s4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    frinti s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    frinti s16, s2
-; CHECK-SD-NOFP16-NEXT:    frinti s17, s3
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    frinti s18, s2
+; CHECK-SD-NOFP16-NEXT:    frinti s19, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
+; CHECK-SD-NOFP16-NEXT:    frinti s4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frinti s7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
+; CHECK-SD-NOFP16-NEXT:    frinti s6, s7
+; CHECK-SD-NOFP16-NEXT:    frinti s16, s16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    frinti s17, s17
+; CHECK-SD-NOFP16-NEXT:    frinti s5, s17
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    frinti s18, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frinti s16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinti s5, s5
-; CHECK-SD-NOFP16-NEXT:    frinti s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s4
+; CHECK-SD-NOFP16-NEXT:    frinti s4, s19
 ; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frinti s6, s6
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT:    frinti s7, s7
+; CHECK-SD-NOFP16-NEXT:    frinti s16, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinti s5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frinti s4, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
+; CHECK-SD-NOFP16-NEXT:    frinti s7, s17
 ; CHECK-SD-NOFP16-NEXT:    frinti s0, s0
-; CHECK-SD-NOFP16-NEXT:    frinti s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frinti s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
@@ -1685,40 +1685,40 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: roundeven_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintn s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintn s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintn s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintn s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintn s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintn s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintn s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintn s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -1748,18 +1748,18 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    frintn v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
@@ -1812,16 +1812,16 @@ define <4 x half> @roundeven_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    frintn s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
 ; CHECK-SD-NOFP16-NEXT:    frintn s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    frintn s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintn s2, s3
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
@@ -1853,40 +1853,40 @@ define <8 x half> @roundeven_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: roundeven_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintn s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintn s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintn s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintn s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintn s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintn s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintn s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintn s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintn s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -1922,78 +1922,78 @@ define <16 x half> @roundeven_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintn s5, s5
+; CHECK-SD-NOFP16-NEXT:    frintn s4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    frintn s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    frintn s16, s2
-; CHECK-SD-NOFP16-NEXT:    frintn s17, s3
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    frintn s18, s2
+; CHECK-SD-NOFP16-NEXT:    frintn s19, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
+; CHECK-SD-NOFP16-NEXT:    frintn s4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintn s7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
+; CHECK-SD-NOFP16-NEXT:    frintn s6, s7
+; CHECK-SD-NOFP16-NEXT:    frintn s16, s16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    frintn s17, s17
+; CHECK-SD-NOFP16-NEXT:    frintn s5, s17
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    frintn s18, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintn s16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintn s5, s5
-; CHECK-SD-NOFP16-NEXT:    frintn s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s4
+; CHECK-SD-NOFP16-NEXT:    frintn s4, s19
 ; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintn s6, s6
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT:    frintn s7, s7
+; CHECK-SD-NOFP16-NEXT:    frintn s16, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintn s5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintn s4, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
+; CHECK-SD-NOFP16-NEXT:    frintn s7, s17
 ; CHECK-SD-NOFP16-NEXT:    frintn s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintn s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintn s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
@@ -2192,40 +2192,40 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: rint_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintx s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintx s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintx s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintx s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintx s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintx s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintx s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintx s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -2255,18 +2255,18 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    frintx v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
@@ -2319,16 +2319,16 @@ define <4 x half> @rint_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    frintx s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
 ; CHECK-SD-NOFP16-NEXT:    frintx s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    frintx s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintx s2, s3
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
@@ -2360,40 +2360,40 @@ define <8 x half> @rint_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: rint_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintx s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintx s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintx s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintx s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintx s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintx s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintx s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintx s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintx s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -2429,78 +2429,78 @@ define <16 x half> @rint_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintx s5, s5
+; CHECK-SD-NOFP16-NEXT:    frintx s4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    frintx s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    frintx s16, s2
-; CHECK-SD-NOFP16-NEXT:    frintx s17, s3
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    frintx s18, s2
+; CHECK-SD-NOFP16-NEXT:    frintx s19, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
+; CHECK-SD-NOFP16-NEXT:    frintx s4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintx s7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
+; CHECK-SD-NOFP16-NEXT:    frintx s6, s7
+; CHECK-SD-NOFP16-NEXT:    frintx s16, s16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    frintx s17, s17
+; CHECK-SD-NOFP16-NEXT:    frintx s5, s17
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    frintx s18, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintx s16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintx s5, s5
-; CHECK-SD-NOFP16-NEXT:    frintx s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s4
+; CHECK-SD-NOFP16-NEXT:    frintx s4, s19
 ; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintx s6, s6
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT:    frintx s7, s7
+; CHECK-SD-NOFP16-NEXT:    frintx s16, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintx s5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintx s4, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
+; CHECK-SD-NOFP16-NEXT:    frintx s7, s17
 ; CHECK-SD-NOFP16-NEXT:    frintx s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintx s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintx s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
@@ -2699,40 +2699,40 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: round_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frinta s3, s3
+; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frinta s5, s1
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frinta s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frinta s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frinta s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frinta s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frinta s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frinta s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -2762,18 +2762,18 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    frinta v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
@@ -2826,16 +2826,16 @@ define <4 x half> @round_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    frinta s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
 ; CHECK-SD-NOFP16-NEXT:    frinta s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    frinta s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frinta s2, s3
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
@@ -2867,40 +2867,40 @@ define <8 x half> @round_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: round_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frinta s3, s3
+; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frinta s5, s1
-; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frinta s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frinta s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frinta s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frinta s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frinta s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frinta s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frinta s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -2936,78 +2936,78 @@ define <16 x half> @round_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinta s5, s5
+; CHECK-SD-NOFP16-NEXT:    frinta s4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    frinta s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    frinta s16, s2
-; CHECK-SD-NOFP16-NEXT:    frinta s17, s3
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    frinta s18, s2
+; CHECK-SD-NOFP16-NEXT:    frinta s19, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
+; CHECK-SD-NOFP16-NEXT:    frinta s4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frinta s7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
+; CHECK-SD-NOFP16-NEXT:    frinta s6, s7
+; CHECK-SD-NOFP16-NEXT:    frinta s16, s16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    frinta s17, s17
+; CHECK-SD-NOFP16-NEXT:    frinta s5, s17
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    frinta s18, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frinta s16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinta s5, s5
-; CHECK-SD-NOFP16-NEXT:    frinta s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s4
+; CHECK-SD-NOFP16-NEXT:    frinta s4, s19
 ; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frinta s6, s6
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT:    frinta s7, s7
+; CHECK-SD-NOFP16-NEXT:    frinta s16, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frinta s5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frinta s4, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
+; CHECK-SD-NOFP16-NEXT:    frinta s7, s17
 ; CHECK-SD-NOFP16-NEXT:    frinta s0, s0
-; CHECK-SD-NOFP16-NEXT:    frinta s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frinta s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b
@@ -3206,40 +3206,40 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: trunc_v7f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintz s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintz s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintz s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintz s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintz s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintz s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintz s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintz s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -3269,18 +3269,18 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    frintz v0.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v3.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
@@ -3333,16 +3333,16 @@ define <4 x half> @trunc_v4f16(<4 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    frintz s0, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h4
 ; CHECK-SD-NOFP16-NEXT:    frintz s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h4
-; CHECK-SD-NOFP16-NEXT:    frintz s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintz s2, s3
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
@@ -3374,40 +3374,40 @@ define <8 x half> @trunc_v8f16(<8 x half> %a) {
 ; CHECK-SD-NOFP16-LABEL: trunc_v8f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    frintz s3, s3
+; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
 ; CHECK-SD-NOFP16-NEXT:    frintz s5, s1
-; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT:    frintz s2, s3
 ; CHECK-SD-NOFP16-NEXT:    frintz s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s5
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    frintz s5, s5
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    frintz s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintz s3, s3
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintz s2, s2
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
 ; CHECK-SD-NOFP16-NEXT:    frintz s0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v0.h[0]
@@ -3443,78 +3443,78 @@ define <16 x half> @trunc_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt s4, h0
+; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s6, h1
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[3]
+; CHECK-SD-NOFP16-NEXT:    mov h20, v0.h[4]
+; CHECK-SD-NOFP16-NEXT:    mov h21, v1.h[4]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
 ; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintz s5, s5
+; CHECK-SD-NOFP16-NEXT:    frintz s4, s4
+; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
 ; CHECK-SD-NOFP16-NEXT:    frintz s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    frintz s16, s2
-; CHECK-SD-NOFP16-NEXT:    frintz s17, s3
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    frintz s18, s2
+; CHECK-SD-NOFP16-NEXT:    frintz s19, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
+; CHECK-SD-NOFP16-NEXT:    frintz s4, s5
 ; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    frintz s7, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    mov h16, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h18
+; CHECK-SD-NOFP16-NEXT:    frintz s6, s7
+; CHECK-SD-NOFP16-NEXT:    frintz s16, s16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt h7, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    frintz s17, s17
+; CHECK-SD-NOFP16-NEXT:    frintz s5, s17
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v7.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-SD-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-SD-NOFP16-NEXT:    frintz s18, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    frintz s16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintz s5, s5
-; CHECK-SD-NOFP16-NEXT:    frintz s6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s4
+; CHECK-SD-NOFP16-NEXT:    frintz s4, s19
 ; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v6.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h7
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov h6, v0.h[6]
+; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
+; CHECK-SD-NOFP16-NEXT:    mov h19, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    frintz s6, s6
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[3], v16.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s18
+; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
+; CHECK-SD-NOFP16-NEXT:    frintz s7, s7
+; CHECK-SD-NOFP16-NEXT:    frintz s16, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    frintz s5, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
 ; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[4], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    frintz s4, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, s16
+; CHECK-SD-NOFP16-NEXT:    frintz s7, s17
 ; CHECK-SD-NOFP16-NEXT:    frintz s0, s0
-; CHECK-SD-NOFP16-NEXT:    frintz s4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    frintz s1, s1
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[5], v5.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v6.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
-; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v2.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v5.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v2.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v2.16b

diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
index 67af07e05ab08a..bfe8d173435c41 100644
--- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll
@@ -110,7 +110,7 @@ define <2 x i32> @test9(<2 x float> %f) {
 define <2 x i32> @test10(<2 x float> %f) {
 ; CHECK-LABEL: test10:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #2143289344
+; CHECK-NEXT:    mov w8, #2143289344 // =0x7fc00000
 ; CHECK-NEXT:    dup v0.2s, w8
 ; CHECK-NEXT:    fcvtzu v0.2s, v0.2s
 ; CHECK-NEXT:    ret
@@ -180,48 +180,48 @@ define <3 x i32> @test_illegal_fp_to_int(<3 x float> %in) {
 define <8 x i16> @test_v8f16(<8 x half> %in) {
 ; CHECK-NO16-LABEL: test_v8f16:
 ; CHECK-NO16:       // %bb.0:
-; CHECK-NO16-NEXT:    mov h2, v0.h[4]
-; CHECK-NO16-NEXT:    mov h3, v0.h[5]
-; CHECK-NO16-NEXT:    mov h4, v0.h[1]
+; CHECK-NO16-NEXT:    mov h2, v0.h[1]
+; CHECK-NO16-NEXT:    mov h3, v0.h[4]
+; CHECK-NO16-NEXT:    mov h4, v0.h[5]
 ; CHECK-NO16-NEXT:    mov h5, v0.h[2]
-; CHECK-NO16-NEXT:    mov h6, v0.h[6]
-; CHECK-NO16-NEXT:    fcvt s7, h0
+; CHECK-NO16-NEXT:    fcvt s6, h0
+; CHECK-NO16-NEXT:    mov h7, v0.h[6]
 ; CHECK-NO16-NEXT:    fmov s1, #4.00000000
 ; CHECK-NO16-NEXT:    mov h16, v0.h[3]
+; CHECK-NO16-NEXT:    mov h0, v0.h[7]
 ; CHECK-NO16-NEXT:    fcvt s2, h2
 ; CHECK-NO16-NEXT:    fcvt s3, h3
 ; CHECK-NO16-NEXT:    fcvt s4, h4
-; CHECK-NO16-NEXT:    mov h0, v0.h[7]
+; CHECK-NO16-NEXT:    fmul s6, s6, s1
 ; CHECK-NO16-NEXT:    fcvt s5, h5
-; CHECK-NO16-NEXT:    fcvt s6, h6
-; CHECK-NO16-NEXT:    fmul s7, s7, s1
+; CHECK-NO16-NEXT:    fcvt s7, h7
 ; CHECK-NO16-NEXT:    fcvt s16, h16
+; CHECK-NO16-NEXT:    fcvt s0, h0
 ; CHECK-NO16-NEXT:    fmul s2, s2, s1
 ; CHECK-NO16-NEXT:    fmul s3, s3, s1
 ; CHECK-NO16-NEXT:    fmul s4, s4, s1
-; CHECK-NO16-NEXT:    fcvt s0, h0
 ; CHECK-NO16-NEXT:    fmul s5, s5, s1
-; CHECK-NO16-NEXT:    fmul s6, s6, s1
-; CHECK-NO16-NEXT:    fcvt h7, s7
+; CHECK-NO16-NEXT:    fcvt h6, s6
+; CHECK-NO16-NEXT:    fmul s7, s7, s1
 ; CHECK-NO16-NEXT:    fmul s16, s16, s1
+; CHECK-NO16-NEXT:    fmul s0, s0, s1
 ; CHECK-NO16-NEXT:    fcvt h2, s2
 ; CHECK-NO16-NEXT:    fcvt h3, s3
 ; CHECK-NO16-NEXT:    fcvt h4, s4
-; CHECK-NO16-NEXT:    fmul s0, s0, s1
-; CHECK-NO16-NEXT:    fcvt h1, s5
-; CHECK-NO16-NEXT:    fcvt h5, s6
-; CHECK-NO16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NO16-NEXT:    fcvt h3, s16
-; CHECK-NO16-NEXT:    mov v7.h[1], v4.h[0]
+; CHECK-NO16-NEXT:    fcvt h5, s5
+; CHECK-NO16-NEXT:    fcvt h1, s7
 ; CHECK-NO16-NEXT:    fcvt h0, s0
-; CHECK-NO16-NEXT:    mov v2.h[2], v5.h[0]
-; CHECK-NO16-NEXT:    mov v7.h[2], v1.h[0]
-; CHECK-NO16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-NO16-NEXT:    mov v7.h[3], v3.h[0]
-; CHECK-NO16-NEXT:    fcvtl v0.4s, v2.4h
-; CHECK-NO16-NEXT:    fcvtl v1.4s, v7.4h
-; CHECK-NO16-NEXT:    fcvtzs v0.4s, v0.4s
+; CHECK-NO16-NEXT:    mov v6.h[1], v2.h[0]
+; CHECK-NO16-NEXT:    fcvt h2, s16
+; CHECK-NO16-NEXT:    mov v3.h[1], v4.h[0]
+; CHECK-NO16-NEXT:    mov v6.h[2], v5.h[0]
+; CHECK-NO16-NEXT:    mov v3.h[2], v1.h[0]
+; CHECK-NO16-NEXT:    mov v6.h[3], v2.h[0]
+; CHECK-NO16-NEXT:    mov v3.h[3], v0.h[0]
+; CHECK-NO16-NEXT:    fcvtl v1.4s, v6.4h
+; CHECK-NO16-NEXT:    fcvtl v0.4s, v3.4h
 ; CHECK-NO16-NEXT:    fcvtzs v1.4s, v1.4s
+; CHECK-NO16-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NO16-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
 ; CHECK-NO16-NEXT:    ret
 ;
@@ -331,9 +331,9 @@ define <2 x i32> @test4_sat(<2 x double> %d) {
 ; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    fcvtzs w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzs w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %mul.i = fmul <2 x double> %d, <double 16.000000e+00, double 16.000000e+00>
@@ -376,9 +376,9 @@ define <2 x i64> @test6_sat(<2 x float> %f) {
 ; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    mov s1, v0.s[1]
 ; CHECK-NEXT:    fcvtzs x8, s0
+; CHECK-NEXT:    fcvtzs x9, s1
 ; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fcvtzs x8, s1
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    mov v0.d[1], x9
 ; CHECK-NEXT:    ret
   %mul.i = fmul <2 x float> %f, <float 16.000000e+00, float 16.000000e+00>
   %vcvt.i = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> %mul.i)
@@ -426,7 +426,7 @@ define <2 x i32> @test9_sat(<2 x float> %f) {
 define <2 x i32> @test10_sat(<2 x float> %f) {
 ; CHECK-LABEL: test10_sat:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #2143289344
+; CHECK-NEXT:    mov w8, #2143289344 // =0x7fc00000
 ; CHECK-NEXT:    dup v0.2s, w8
 ; CHECK-NEXT:    fcvtzu v0.2s, v0.2s
 ; CHECK-NEXT:    ret
@@ -498,101 +498,101 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) {
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    mov h2, v0.h[4]
 ; CHECK-NO16-NEXT:    mov h3, v0.h[5]
+; CHECK-NO16-NEXT:    mov w9, #32767 // =0x7fff
 ; CHECK-NO16-NEXT:    mov h4, v0.h[6]
 ; CHECK-NO16-NEXT:    fmov s1, #4.00000000
+; CHECK-NO16-NEXT:    mov w11, #-32768 // =0xffff8000
 ; CHECK-NO16-NEXT:    mov h5, v0.h[7]
 ; CHECK-NO16-NEXT:    mov h6, v0.h[1]
 ; CHECK-NO16-NEXT:    mov h7, v0.h[2]
 ; CHECK-NO16-NEXT:    fcvt s16, h0
+; CHECK-NO16-NEXT:    mov h0, v0.h[3]
 ; CHECK-NO16-NEXT:    fcvt s2, h2
 ; CHECK-NO16-NEXT:    fcvt s3, h3
 ; CHECK-NO16-NEXT:    fcvt s4, h4
-; CHECK-NO16-NEXT:    mov h0, v0.h[3]
 ; CHECK-NO16-NEXT:    fcvt s5, h5
 ; CHECK-NO16-NEXT:    fcvt s6, h6
-; CHECK-NO16-NEXT:    mov w9, #32767
-; CHECK-NO16-NEXT:    mov w10, #-32768
+; CHECK-NO16-NEXT:    fcvt s0, h0
 ; CHECK-NO16-NEXT:    fmul s2, s2, s1
 ; CHECK-NO16-NEXT:    fmul s3, s3, s1
 ; CHECK-NO16-NEXT:    fmul s4, s4, s1
-; CHECK-NO16-NEXT:    fcvt s0, h0
 ; CHECK-NO16-NEXT:    fmul s5, s5, s1
 ; CHECK-NO16-NEXT:    fmul s6, s6, s1
+; CHECK-NO16-NEXT:    fmul s0, s0, s1
 ; CHECK-NO16-NEXT:    fcvt h2, s2
 ; CHECK-NO16-NEXT:    fcvt h3, s3
-; CHECK-NO16-NEXT:    fmul s0, s0, s1
+; CHECK-NO16-NEXT:    fcvt h4, s4
 ; CHECK-NO16-NEXT:    fcvt h5, s5
 ; CHECK-NO16-NEXT:    fcvt h6, s6
+; CHECK-NO16-NEXT:    fcvt h0, s0
 ; CHECK-NO16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NO16-NEXT:    fcvt h3, s4
-; CHECK-NO16-NEXT:    fcvt s4, h7
+; CHECK-NO16-NEXT:    fcvt s3, h7
 ; CHECK-NO16-NEXT:    fmul s7, s16, s1
-; CHECK-NO16-NEXT:    fcvt h0, s0
-; CHECK-NO16-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-NO16-NEXT:    fmul s3, s4, s1
+; CHECK-NO16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-NO16-NEXT:    fmul s3, s3, s1
 ; CHECK-NO16-NEXT:    fcvt h4, s7
 ; CHECK-NO16-NEXT:    mov v2.h[3], v5.h[0]
 ; CHECK-NO16-NEXT:    fcvt h1, s3
 ; CHECK-NO16-NEXT:    mov v4.h[1], v6.h[0]
 ; CHECK-NO16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-NO16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-NO16-NEXT:    mov s1, v2.s[1]
-; CHECK-NO16-NEXT:    fcvtzs w11, s2
+; CHECK-NO16-NEXT:    mov s3, v2.s[1]
 ; CHECK-NO16-NEXT:    mov v4.h[3], v0.h[0]
 ; CHECK-NO16-NEXT:    mov s0, v2.s[2]
+; CHECK-NO16-NEXT:    fcvtzs w10, s2
 ; CHECK-NO16-NEXT:    mov s2, v2.s[3]
-; CHECK-NO16-NEXT:    fcvtzs w8, s1
+; CHECK-NO16-NEXT:    fcvtzs w8, s3
 ; CHECK-NO16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NO16-NEXT:    fcvtzs w12, s0
-; CHECK-NO16-NEXT:    cmp w8, w9
 ; CHECK-NO16-NEXT:    fcvtzs w13, s2
-; CHECK-NO16-NEXT:    csel w8, w8, w9, lt
-; CHECK-NO16-NEXT:    cmn w8, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w8, w8, w10, gt
-; CHECK-NO16-NEXT:    cmp w11, w9
-; CHECK-NO16-NEXT:    csel w11, w11, w9, lt
+; CHECK-NO16-NEXT:    cmp w8, w9
 ; CHECK-NO16-NEXT:    mov s0, v1.s[1]
-; CHECK-NO16-NEXT:    cmn w11, #8, lsl #12 // =32768
 ; CHECK-NO16-NEXT:    fcvtzs w15, s1
-; CHECK-NO16-NEXT:    csel w11, w11, w10, gt
+; CHECK-NO16-NEXT:    csel w8, w8, w9, lt
+; CHECK-NO16-NEXT:    cmn w8, #8, lsl #12 // =32768
+; CHECK-NO16-NEXT:    csel w8, w8, w11, gt
+; CHECK-NO16-NEXT:    cmp w10, w9
+; CHECK-NO16-NEXT:    csel w10, w10, w9, lt
+; CHECK-NO16-NEXT:    fcvtzs w14, s0
+; CHECK-NO16-NEXT:    mov s0, v1.s[2]
+; CHECK-NO16-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-NO16-NEXT:    csel w10, w10, w11, gt
 ; CHECK-NO16-NEXT:    cmp w12, w9
 ; CHECK-NO16-NEXT:    csel w12, w12, w9, lt
 ; CHECK-NO16-NEXT:    cmn w12, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    fcvtzs w14, s0
-; CHECK-NO16-NEXT:    csel w12, w12, w10, gt
+; CHECK-NO16-NEXT:    fcvtzs w16, s0
+; CHECK-NO16-NEXT:    mov s0, v1.s[3]
+; CHECK-NO16-NEXT:    csel w12, w12, w11, gt
 ; CHECK-NO16-NEXT:    cmp w13, w9
+; CHECK-NO16-NEXT:    fmov s1, w10
 ; CHECK-NO16-NEXT:    csel w13, w13, w9, lt
-; CHECK-NO16-NEXT:    mov s0, v1.s[2]
 ; CHECK-NO16-NEXT:    cmn w13, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    fmov s2, w11
-; CHECK-NO16-NEXT:    csel w13, w13, w10, gt
+; CHECK-NO16-NEXT:    csel w13, w13, w11, gt
 ; CHECK-NO16-NEXT:    cmp w14, w9
+; CHECK-NO16-NEXT:    mov v1.s[1], w8
 ; CHECK-NO16-NEXT:    csel w14, w14, w9, lt
+; CHECK-NO16-NEXT:    fcvtzs w8, s0
 ; CHECK-NO16-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w14, w14, w10, gt
+; CHECK-NO16-NEXT:    csel w14, w14, w11, gt
 ; CHECK-NO16-NEXT:    cmp w15, w9
 ; CHECK-NO16-NEXT:    csel w15, w15, w9, lt
+; CHECK-NO16-NEXT:    mov v1.s[2], w12
 ; CHECK-NO16-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w11, w15, w10, gt
-; CHECK-NO16-NEXT:    fcvtzs w15, s0
-; CHECK-NO16-NEXT:    mov s0, v1.s[3]
-; CHECK-NO16-NEXT:    mov v2.s[1], w8
-; CHECK-NO16-NEXT:    fmov s1, w11
-; CHECK-NO16-NEXT:    cmp w15, w9
-; CHECK-NO16-NEXT:    csel w8, w15, w9, lt
-; CHECK-NO16-NEXT:    fcvtzs w11, s0
+; CHECK-NO16-NEXT:    csel w10, w15, w11, gt
+; CHECK-NO16-NEXT:    cmp w16, w9
+; CHECK-NO16-NEXT:    fmov s2, w10
+; CHECK-NO16-NEXT:    csel w10, w16, w9, lt
+; CHECK-NO16-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-NO16-NEXT:    mov v1.s[3], w13
+; CHECK-NO16-NEXT:    csel w10, w10, w11, gt
+; CHECK-NO16-NEXT:    cmp w8, w9
+; CHECK-NO16-NEXT:    mov v2.s[1], w14
+; CHECK-NO16-NEXT:    csel w8, w8, w9, lt
 ; CHECK-NO16-NEXT:    cmn w8, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    mov v1.s[1], w14
-; CHECK-NO16-NEXT:    csel w8, w8, w10, gt
-; CHECK-NO16-NEXT:    mov v2.s[2], w12
-; CHECK-NO16-NEXT:    cmp w11, w9
-; CHECK-NO16-NEXT:    csel w9, w11, w9, lt
-; CHECK-NO16-NEXT:    mov v1.s[2], w8
-; CHECK-NO16-NEXT:    cmn w9, #8, lsl #12 // =32768
-; CHECK-NO16-NEXT:    csel w8, w9, w10, gt
-; CHECK-NO16-NEXT:    mov v2.s[3], w13
-; CHECK-NO16-NEXT:    mov v1.s[3], w8
-; CHECK-NO16-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
+; CHECK-NO16-NEXT:    csel w8, w8, w11, gt
+; CHECK-NO16-NEXT:    mov v2.s[2], w10
+; CHECK-NO16-NEXT:    mov v2.s[3], w8
+; CHECK-NO16-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-NO16-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_v8f16_sat:

diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll
index 57fef74e502752..1ed63f3ef25077 100644
--- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll
@@ -100,8 +100,8 @@ define void @two_fdiv_double(double %D, double %a, double %b) #0 {
 define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
 ; CHECK-LABEL: splat_three_fdiv_4xfloat:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov v4.4s, #1.00000000
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    fmov v4.4s, #1.00000000
 ; CHECK-NEXT:    dup v0.4s, v0.s[0]
 ; CHECK-NEXT:    fdiv v4.4s, v4.4s, v0.4s
 ; CHECK-NEXT:    fmul v0.4s, v1.4s, v4.4s
@@ -120,8 +120,8 @@ define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b,
 define <4 x float> @splat_fdiv_v4f32(float %D, <4 x float> %a) #1 {
 ; CHECK-LABEL: splat_fdiv_v4f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov v2.4s, #1.00000000
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    fmov v2.4s, #1.00000000
 ; CHECK-NEXT:    dup v0.4s, v0.s[0]
 ; CHECK-NEXT:    fdiv v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    fmul v0.4s, v1.4s, v0.4s
@@ -171,8 +171,8 @@ entry:
 define <vscale x 2 x double> @splat_fdiv_nxv2f64(double %D, <vscale x 2 x double> %a) #1 {
 ; CHECK-LABEL: splat_fdiv_nxv2f64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    fdivr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 66333e9b6a2fd6..1b1cfead0f97ac 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -9,8 +9,8 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
 ; CHECK-NEXT:    mov v1.16b, v2.16b
 ; CHECK-NEXT:    mov v1.h[0], v0.h[1]
 ; CHECK-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-NEXT:    ret
   %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
   ret {<2 x half>, <2 x half>}   %retval

diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index f9e56633195d9f..071c1ffdbb45dc 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -13,9 +13,9 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
 define <8 x half> @interleave2_v8f16(<4 x half> %vec0, <4 x half> %vec1) {
 ; CHECK-LABEL: interleave2_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI1_0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    adrp x8, .LCPI1_0
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b

diff --git a/llvm/test/CodeGen/AArch64/flags-multiuse.ll b/llvm/test/CodeGen/AArch64/flags-multiuse.ll
index 62aaa9e0e82500..005207b28d7d99 100644
--- a/llvm/test/CodeGen/AArch64/flags-multiuse.ll
+++ b/llvm/test/CodeGen/AArch64/flags-multiuse.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -verify-machineinstrs -o - %s | FileCheck %s
 
 ; LLVM should be able to cope with multiple uses of the same flag-setting
@@ -10,26 +11,45 @@ declare void @bar()
 
 define i32 @test_multiflag(i32 %n, i32 %m, i32 %o) {
 ; CHECK-LABEL: test_multiflag:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    adrp x8, :got:var
+; CHECK-NEXT:    cmp w0, w1
+; CHECK-NEXT:    mov w19, w1
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:var]
+; CHECK-NEXT:    cset w9, ne
+; CHECK-NEXT:    mov w20, w0
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    bl bar
+; CHECK-NEXT:    cmp w20, w19
+; CHECK-NEXT:    b.eq .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %iftrue
+; CHECK-NEXT:    mov w0, #42 // =0x2a
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_2: // %iffalse
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:  .LBB0_3: // %iftrue
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
 
   %test = icmp ne i32 %n, %m
-; CHECK: cmp [[LHS:w[0-9]+]], [[RHS:w[0-9]+]]
 
   %val = zext i1 %test to i32
-; CHECK: cset {{[xw][0-9]+}}, ne
 
-; CHECK: mov [[RHSCOPY:w[0-9]+]], [[RHS]]
-; CHECK: mov [[LHSCOPY:w[0-9]+]], [[LHS]]
 
   store i32 %val, ptr @var
 
   call void @bar()
-; CHECK: bl bar
 
   ; Currently, the comparison is emitted again. An MSR/MRS pair would also be
   ; acceptable, but assuming the call preserves NZCV is not.
   br i1 %test, label %iftrue, label %iffalse
-; CHECK: cmp [[LHSCOPY]], [[RHSCOPY]]
-; CHECK: b.eq
 
 iftrue:
   ret i32 42
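
For reference, the MSR/MRS pair mentioned in the test comment above would look roughly like the sketch below. This is an illustrative sequence only, not what LLVM currently emits; the x9 scratch register and the branch target are placeholders.

    mrs x9, NZCV      // save the N/Z/C/V flags; the AAPCS does not
                      // guarantee that a callee preserves them
    bl  bar           // call may clobber the flags
    msr NZCV, x9      // restore the flags after the call returns
    b.eq .LBB0_2      // branch on the original comparison result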

diff --git a/llvm/test/CodeGen/AArch64/fmaximum-legalization.ll b/llvm/test/CodeGen/AArch64/fmaximum-legalization.ll
index fa86e6ceec1afd..86c1474068482e 100644
--- a/llvm/test/CodeGen/AArch64/fmaximum-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/fmaximum-legalization.ll
@@ -13,22 +13,22 @@ define <4 x half> @fmaximum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov h2, v1.h[1]
 ; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    fcvt s4, h1
-; CHECK-NEXT:    fcvt s5, h0
-; CHECK-NEXT:    mov h6, v1.h[2]
-; CHECK-NEXT:    mov h7, v0.h[2]
+; CHECK-NEXT:    mov h4, v1.h[2]
+; CHECK-NEXT:    mov h5, v0.h[2]
+; CHECK-NEXT:    fcvt s6, h1
+; CHECK-NEXT:    fcvt s7, h0
 ; CHECK-NEXT:    mov h1, v1.h[3]
 ; CHECK-NEXT:    fcvt s2, h2
 ; CHECK-NEXT:    fcvt s3, h3
-; CHECK-NEXT:    fmax s4, s5, s4
-; CHECK-NEXT:    fcvt s5, h7
 ; CHECK-NEXT:    fcvt s1, h1
 ; CHECK-NEXT:    fmax s2, s3, s2
-; CHECK-NEXT:    fcvt s3, h6
+; CHECK-NEXT:    fcvt s3, h4
+; CHECK-NEXT:    fcvt s4, h5
+; CHECK-NEXT:    fmax s5, s7, s6
 ; CHECK-NEXT:    mov h6, v0.h[3]
-; CHECK-NEXT:    fcvt h0, s4
+; CHECK-NEXT:    fmax s3, s4, s3
 ; CHECK-NEXT:    fcvt h2, s2
-; CHECK-NEXT:    fmax s3, s5, s3
+; CHECK-NEXT:    fcvt h0, s5
 ; CHECK-NEXT:    fcvt s4, h6
 ; CHECK-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-NEXT:    fcvt h2, s3

diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index 2492db229ceff5..b2952a1bc50389 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -201,22 +201,34 @@ entry:
 }
 
 define <4 x double> @min_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: min_v4f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmin v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    fmin v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: min_v4f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmin v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    fmin v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: min_v4f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmin v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    fmin v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <4 x double> @llvm.minimum.v4f64(<4 x double> %a, <4 x double> %b)
   ret <4 x double> %c
 }
 
 define <4 x double> @max_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: max_v4f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmax v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    fmax v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: max_v4f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmax v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    fmax v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: max_v4f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmax v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    fmax v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <4 x double> @llvm.maximum.v4f64(<4 x double> %a, <4 x double> %b)
   ret <4 x double> %c
@@ -325,39 +337,39 @@ entry:
 define <7 x float> @min_v7f32(<7 x float> %a, <7 x float> %b) {
 ; CHECK-SD-LABEL: min_v7f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov x8, sp
-; CHECK-SD-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-SD-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-SD-NEXT:    add x9, sp, #32
-; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-SD-NEXT:    ldr s1, [sp, #24]
+; CHECK-SD-NEXT:    // kill: def $s7 killed $s7 def $q7
+; CHECK-SD-NEXT:    mov x8, sp
 ; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-SD-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-SD-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-SD-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $s3 killed $s3 def $q3
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[1], [x8]
+; CHECK-SD-NEXT:    ldr s1, [sp, #24]
 ; CHECK-SD-NEXT:    add x8, sp, #8
-; CHECK-SD-NEXT:    ld1 { v1.s }[1], [x9]
-; CHECK-SD-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-SD-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #32
+; CHECK-SD-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-SD-NEXT:    ld1 { v1.s }[1], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #16
-; CHECK-SD-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-SD-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[3], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #40
 ; CHECK-SD-NEXT:    ld1 { v1.s }[2], [x8]
-; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v7.4s
+; CHECK-SD-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-SD-NEXT:    fmin v4.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v7.4s
+; CHECK-SD-NEXT:    mov s5, v4.s[1]
+; CHECK-SD-NEXT:    mov s6, v4.s[2]
+; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-SD-NEXT:    mov s1, v0.s[1]
 ; CHECK-SD-NEXT:    mov s2, v0.s[2]
 ; CHECK-SD-NEXT:    mov s3, v0.s[3]
 ; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    mov s5, v4.s[1]
-; CHECK-SD-NEXT:    mov s6, v4.s[2]
-; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: min_v7f32:
@@ -365,31 +377,31 @@ define <7 x float> @min_v7f32(<7 x float> %a, <7 x float> %b) {
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-GI-NEXT:    ldr s16, [sp]
+; CHECK-GI-NEXT:    ldr s17, [sp, #24]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
-; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
+; CHECK-GI-NEXT:    ldr s18, [sp, #32]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    ldr s19, [sp, #24]
-; CHECK-GI-NEXT:    ldr s20, [sp, #32]
-; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    ldr s17, [sp, #8]
 ; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    ldr s21, [sp, #40]
-; CHECK-GI-NEXT:    mov v19.s[1], v20.s[0]
-; CHECK-GI-NEXT:    ldr s18, [sp, #16]
+; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
+; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
+; CHECK-GI-NEXT:    ldr s5, [sp, #40]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
-; CHECK-GI-NEXT:    mov v7.s[2], v17.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v19.s[2], v21.s[0]
+; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
+; CHECK-GI-NEXT:    ldr s1, [sp, #16]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
-; CHECK-GI-NEXT:    mov v7.s[3], v18.s[0]
+; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v19.s[3], v0.s[0]
+; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fmin v4.4s, v4.4s, v19.4s
+; CHECK-GI-NEXT:    fmin v4.4s, v4.4s, v17.4s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
@@ -406,39 +418,39 @@ entry:
 define <7 x float> @max_v7f32(<7 x float> %a, <7 x float> %b) {
 ; CHECK-SD-LABEL: max_v7f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov x8, sp
-; CHECK-SD-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-SD-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-SD-NEXT:    add x9, sp, #32
-; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-SD-NEXT:    ldr s1, [sp, #24]
+; CHECK-SD-NEXT:    // kill: def $s7 killed $s7 def $q7
+; CHECK-SD-NEXT:    mov x8, sp
 ; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-SD-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-SD-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-SD-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $s3 killed $s3 def $q3
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[1], [x8]
+; CHECK-SD-NEXT:    ldr s1, [sp, #24]
 ; CHECK-SD-NEXT:    add x8, sp, #8
-; CHECK-SD-NEXT:    ld1 { v1.s }[1], [x9]
-; CHECK-SD-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-SD-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #32
+; CHECK-SD-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-SD-NEXT:    ld1 { v1.s }[1], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #16
-; CHECK-SD-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-SD-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[3], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #40
 ; CHECK-SD-NEXT:    ld1 { v1.s }[2], [x8]
-; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v7.4s
+; CHECK-SD-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-SD-NEXT:    fmax v4.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v7.4s
+; CHECK-SD-NEXT:    mov s5, v4.s[1]
+; CHECK-SD-NEXT:    mov s6, v4.s[2]
+; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-SD-NEXT:    mov s1, v0.s[1]
 ; CHECK-SD-NEXT:    mov s2, v0.s[2]
 ; CHECK-SD-NEXT:    mov s3, v0.s[3]
 ; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    mov s5, v4.s[1]
-; CHECK-SD-NEXT:    mov s6, v4.s[2]
-; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: max_v7f32:
@@ -446,31 +458,31 @@ define <7 x float> @max_v7f32(<7 x float> %a, <7 x float> %b) {
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-GI-NEXT:    ldr s16, [sp]
+; CHECK-GI-NEXT:    ldr s17, [sp, #24]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
-; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
+; CHECK-GI-NEXT:    ldr s18, [sp, #32]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    ldr s19, [sp, #24]
-; CHECK-GI-NEXT:    ldr s20, [sp, #32]
-; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    ldr s17, [sp, #8]
 ; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    ldr s21, [sp, #40]
-; CHECK-GI-NEXT:    mov v19.s[1], v20.s[0]
-; CHECK-GI-NEXT:    ldr s18, [sp, #16]
+; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
+; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
+; CHECK-GI-NEXT:    ldr s5, [sp, #40]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
-; CHECK-GI-NEXT:    mov v7.s[2], v17.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v19.s[2], v21.s[0]
+; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
+; CHECK-GI-NEXT:    ldr s1, [sp, #16]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
-; CHECK-GI-NEXT:    mov v7.s[3], v18.s[0]
+; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v19.s[3], v0.s[0]
+; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fmax v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fmax v4.4s, v4.4s, v19.4s
+; CHECK-GI-NEXT:    fmax v4.4s, v4.4s, v17.4s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
@@ -485,22 +497,34 @@ entry:
 }
 
 define <8 x float> @min_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: min_v8f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmin v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    fmin v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: min_v8f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmin v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: min_v8f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    fmin v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <8 x float> @llvm.minimum.v8f32(<8 x float> %a, <8 x float> %b)
   ret <8 x float> %c
 }
 
 define <8 x float> @max_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: max_v8f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmax v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    fmax v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: max_v8f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmax v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: max_v8f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmax v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    fmax v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <8 x float> @llvm.maximum.v8f32(<8 x float> %a, <8 x float> %b)
   ret <8 x float> %c
@@ -513,22 +537,22 @@ define <4 x half> @min_v4f16(<4 x half> %a, <4 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NOFP16-SD-NEXT:    mov h2, v1.h[1]
 ; CHECK-NOFP16-SD-NEXT:    mov h3, v0.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h0
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h1
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h0
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
-; CHECK-NOFP16-SD-NEXT:    fmin s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h7
 ; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fmin s2, s3, s2
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h5
+; CHECK-NOFP16-SD-NEXT:    fmin s5, s7, s6
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt h0, s4
+; CHECK-NOFP16-SD-NEXT:    fmin s3, s4, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s2
-; CHECK-NOFP16-SD-NEXT:    fmin s3, s5, s3
+; CHECK-NOFP16-SD-NEXT:    fcvt h0, s5
 ; CHECK-NOFP16-SD-NEXT:    fcvt s4, h6
 ; CHECK-NOFP16-SD-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s3
@@ -568,22 +592,22 @@ define <4 x half> @max_v4f16(<4 x half> %a, <4 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NOFP16-SD-NEXT:    mov h2, v1.h[1]
 ; CHECK-NOFP16-SD-NEXT:    mov h3, v0.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h0
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h1
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h0
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
-; CHECK-NOFP16-SD-NEXT:    fmax s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h7
 ; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fmax s2, s3, s2
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h5
+; CHECK-NOFP16-SD-NEXT:    fmax s5, s7, s6
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt h0, s4
+; CHECK-NOFP16-SD-NEXT:    fmax s3, s4, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s2
-; CHECK-NOFP16-SD-NEXT:    fmax s3, s5, s3
+; CHECK-NOFP16-SD-NEXT:    fcvt h0, s5
 ; CHECK-NOFP16-SD-NEXT:    fcvt s4, h6
 ; CHECK-NOFP16-SD-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s3
@@ -626,46 +650,46 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP16-SD-NEXT:    fmin s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
 ; CHECK-NOFP16-SD-NEXT:    fmin s3, s3, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s4
-; CHECK-NOFP16-SD-NEXT:    fmin s4, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h5, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fmin s7, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fmin s4, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[4]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP16-SD-NEXT:    fmin s5, s5, s16
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
-; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    fmin s5, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
+; CHECK-NOFP16-SD-NEXT:    mov h7, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fmin s3, s6, s3
+; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h16
-; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[6]
 ; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fmin s3, s4, s3
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fmin s6, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP16-SD-NEXT:    fmin s4, s6, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmin s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s6
 ; CHECK-NOFP16-SD-NEXT:    fmin s0, s0, s1
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
@@ -711,18 +735,18 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fmin v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    fmin v0.4s, v0.4s, v1.4s
 ; CHECK-NOFP16-GI-NEXT:    mov s1, v2.s[1]
-; CHECK-NOFP16-GI-NEXT:    mov s3, v2.s[2]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT:    mov s3, v2.s[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[1], v1.s[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[3]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[3], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h2, v1.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
@@ -734,28 +758,28 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-FP16-GI-LABEL: min_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
 ; CHECK-FP16-GI-NEXT:    mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT:    mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT:    mov h4, v0.h[3]
-; CHECK-FP16-GI-NEXT:    mov h5, v0.h[4]
-; CHECK-FP16-GI-NEXT:    mov h6, v0.h[5]
-; CHECK-FP16-GI-NEXT:    mov h7, v0.h[6]
-; CHECK-FP16-GI-NEXT:    mov h16, v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov h4, v0.h[2]
+; CHECK-FP16-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-FP16-GI-NEXT:    mov h6, v0.h[4]
+; CHECK-FP16-GI-NEXT:    mov h7, v0.h[5]
+; CHECK-FP16-GI-NEXT:    mov h16, v0.h[6]
 ; CHECK-FP16-GI-NEXT:    mov h17, v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-FP16-GI-NEXT:    mov h18, v1.h[3]
 ; CHECK-FP16-GI-NEXT:    mov h19, v1.h[4]
 ; CHECK-FP16-GI-NEXT:    mov h20, v1.h[5]
 ; CHECK-FP16-GI-NEXT:    mov h21, v1.h[6]
-; CHECK-FP16-GI-NEXT:    mov v1.h[1], v16.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-FP16-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[2], v4.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[2], v17.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[3], v5.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[3], v18.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[4], v5.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[4], v6.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[4], v19.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[5], v6.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[5], v7.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[5], v20.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[6], v7.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[6], v16.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[6], v21.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[7], v0.h[0]
@@ -789,46 +813,46 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP16-SD-NEXT:    fmax s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
 ; CHECK-NOFP16-SD-NEXT:    fmax s3, s3, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s4
-; CHECK-NOFP16-SD-NEXT:    fmax s4, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h5, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fmax s7, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fmax s4, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[4]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP16-SD-NEXT:    fmax s5, s5, s16
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
-; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    fmax s5, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
+; CHECK-NOFP16-SD-NEXT:    mov h7, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fmax s3, s6, s3
+; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h16
-; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[6]
 ; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fmax s3, s4, s3
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fmax s6, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP16-SD-NEXT:    fmax s4, s6, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmax s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s6
 ; CHECK-NOFP16-SD-NEXT:    fmax s0, s0, s1
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
@@ -874,18 +898,18 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fmax v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    fmax v0.4s, v0.4s, v1.4s
 ; CHECK-NOFP16-GI-NEXT:    mov s1, v2.s[1]
-; CHECK-NOFP16-GI-NEXT:    mov s3, v2.s[2]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT:    mov s3, v2.s[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[1], v1.s[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[3]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[3], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h2, v1.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
@@ -897,28 +921,28 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-FP16-GI-LABEL: max_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
 ; CHECK-FP16-GI-NEXT:    mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT:    mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT:    mov h4, v0.h[3]
-; CHECK-FP16-GI-NEXT:    mov h5, v0.h[4]
-; CHECK-FP16-GI-NEXT:    mov h6, v0.h[5]
-; CHECK-FP16-GI-NEXT:    mov h7, v0.h[6]
-; CHECK-FP16-GI-NEXT:    mov h16, v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov h4, v0.h[2]
+; CHECK-FP16-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-FP16-GI-NEXT:    mov h6, v0.h[4]
+; CHECK-FP16-GI-NEXT:    mov h7, v0.h[5]
+; CHECK-FP16-GI-NEXT:    mov h16, v0.h[6]
 ; CHECK-FP16-GI-NEXT:    mov h17, v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-FP16-GI-NEXT:    mov h18, v1.h[3]
 ; CHECK-FP16-GI-NEXT:    mov h19, v1.h[4]
 ; CHECK-FP16-GI-NEXT:    mov h20, v1.h[5]
 ; CHECK-FP16-GI-NEXT:    mov h21, v1.h[6]
-; CHECK-FP16-GI-NEXT:    mov v1.h[1], v16.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-FP16-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[2], v4.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[2], v17.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[3], v5.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[3], v18.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[4], v5.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[4], v6.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[4], v19.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[5], v6.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[5], v7.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[5], v20.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[6], v7.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[6], v16.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[6], v21.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[7], v0.h[0]
@@ -952,46 +976,46 @@ define <8 x half> @min_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP16-SD-NEXT:    fmin s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
 ; CHECK-NOFP16-SD-NEXT:    fmin s3, s3, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s4
-; CHECK-NOFP16-SD-NEXT:    fmin s4, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h5, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fmin s7, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fmin s4, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[4]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP16-SD-NEXT:    fmin s5, s5, s16
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
-; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    fmin s5, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
+; CHECK-NOFP16-SD-NEXT:    mov h7, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fmin s3, s6, s3
+; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h16
-; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[6]
 ; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fmin s3, s4, s3
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fmin s6, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP16-SD-NEXT:    fmin s4, s6, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmin s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s6
 ; CHECK-NOFP16-SD-NEXT:    fmin s0, s0, s1
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
@@ -1037,46 +1061,46 @@ define <8 x half> @max_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP16-SD-NEXT:    fmax s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
 ; CHECK-NOFP16-SD-NEXT:    fmax s3, s3, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s4
-; CHECK-NOFP16-SD-NEXT:    fmax s4, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h5, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fmax s7, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fmax s4, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[4]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP16-SD-NEXT:    fmax s5, s5, s16
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
-; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    fmax s5, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
+; CHECK-NOFP16-SD-NEXT:    mov h7, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fmax s3, s6, s3
+; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h16
-; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[6]
 ; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fmax s3, s4, s3
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fmax s6, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP16-SD-NEXT:    fmax s4, s6, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmax s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s6
 ; CHECK-NOFP16-SD-NEXT:    fmax s0, s0, s1
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
@@ -1115,110 +1139,110 @@ entry:
 define <16 x half> @min_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-NOFP16-SD-LABEL: min_v16f16:
 ; CHECK-NOFP16-SD:       // %bb.0: // %entry
-; CHECK-NOFP16-SD-NEXT:    mov h4, v2.h[1]
-; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h2
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h0
-; CHECK-NOFP16-SD-NEXT:    mov h16, v2.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[1]
-; CHECK-NOFP16-SD-NEXT:    mov h19, v1.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fmin s6, s7, s6
-; CHECK-NOFP16-SD-NEXT:    fcvt s20, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v2.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
-; CHECK-NOFP16-SD-NEXT:    fmin s5, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s6
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[3]
-; CHECK-NOFP16-SD-NEXT:    fmin s7, s16, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h3
-; CHECK-NOFP16-SD-NEXT:    mov h21, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h17
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
-; CHECK-NOFP16-SD-NEXT:    fmin s18, s19, s18
+; CHECK-NOFP16-SD-NEXT:    mov h6, v2.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[1]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h0
+; CHECK-NOFP16-SD-NEXT:    mov h16, v3.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h17, v1.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h18, v2.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h19, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h3
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h1
+; CHECK-NOFP16-SD-NEXT:    mov h22, v3.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    mov h19, v3.h[2]
-; CHECK-NOFP16-SD-NEXT:    fmin s16, s20, s16
-; CHECK-NOFP16-SD-NEXT:    mov h20, v1.h[2]
-; CHECK-NOFP16-SD-NEXT:    fcvt s21, h21
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s7
-; CHECK-NOFP16-SD-NEXT:    mov h7, v2.h[4]
-; CHECK-NOFP16-SD-NEXT:    fmin s6, s6, s17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v3.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s20, h20
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[2], v5.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s16
 ; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s18
-; CHECK-NOFP16-SD-NEXT:    mov h18, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
-; CHECK-NOFP16-SD-NEXT:    fmin s7, s21, s7
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[1], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
-; CHECK-NOFP16-SD-NEXT:    fmin s17, s20, s19
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
-; CHECK-NOFP16-SD-NEXT:    mov h19, v3.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h20, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[3], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fmin s7, s18, s16
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s17
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h20
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[4], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    mov h19, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[2], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fmin s7, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov h16, v2.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[3], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
+; CHECK-NOFP16-SD-NEXT:    mov h24, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fmin s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h16
 ; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
 ; CHECK-NOFP16-SD-NEXT:    fcvt s17, h18
 ; CHECK-NOFP16-SD-NEXT:    fcvt s18, h19
+; CHECK-NOFP16-SD-NEXT:    mov h19, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fmin s20, s21, s20
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h22
+; CHECK-NOFP16-SD-NEXT:    mov h22, v3.h[3]
+; CHECK-NOFP16-SD-NEXT:    fmin s6, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h7, v2.h[3]
+; CHECK-NOFP16-SD-NEXT:    mov h25, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
+; CHECK-NOFP16-SD-NEXT:    fmin s5, s16, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h23
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[3]
+; CHECK-NOFP16-SD-NEXT:    fmin s17, s18, s17
+; CHECK-NOFP16-SD-NEXT:    fcvt s18, h19
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt h19, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s20
+; CHECK-NOFP16-SD-NEXT:    fmin s16, s16, s21
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h23
+; CHECK-NOFP16-SD-NEXT:    fcvt h17, s17
+; CHECK-NOFP16-SD-NEXT:    mov h21, v2.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[1], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h22
+; CHECK-NOFP16-SD-NEXT:    mov h22, v0.h[4]
+; CHECK-NOFP16-SD-NEXT:    fmin s7, s18, s7
+; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[1], v19.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt h16, s16
+; CHECK-NOFP16-SD-NEXT:    fmin s6, s20, s6
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[2], v17.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s17, h21
+; CHECK-NOFP16-SD-NEXT:    fcvt s19, h22
+; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h23
+; CHECK-NOFP16-SD-NEXT:    mov h21, v2.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov h22, v0.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[2], v16.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h16, v3.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
+; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
+; CHECK-NOFP16-SD-NEXT:    fmin s17, s19, s17
 ; CHECK-NOFP16-SD-NEXT:    mov h19, v2.h[6]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[3], v7.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmin s18, s20, s18
+; CHECK-NOFP16-SD-NEXT:    mov h20, v3.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h21
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h22
+; CHECK-NOFP16-SD-NEXT:    fcvt s22, h24
 ; CHECK-NOFP16-SD-NEXT:    mov h2, v2.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[4], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
-; CHECK-NOFP16-SD-NEXT:    fmin s7, s16, s7
-; CHECK-NOFP16-SD-NEXT:    fmin s16, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v3.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[3], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h23
+; CHECK-NOFP16-SD-NEXT:    fcvt h17, s17
 ; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
+; CHECK-NOFP16-SD-NEXT:    fcvt s23, h25
+; CHECK-NOFP16-SD-NEXT:    fcvt h18, s18
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h20
 ; CHECK-NOFP16-SD-NEXT:    mov h3, v3.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h17
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
+; CHECK-NOFP16-SD-NEXT:    fmin s7, s21, s7
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
-; CHECK-NOFP16-SD-NEXT:    fmin s6, s6, s19
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP16-SD-NEXT:    fmin s6, s16, s6
 ; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s16
-; CHECK-NOFP16-SD-NEXT:    fmin s17, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[5], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[4], v17.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmin s16, s22, s19
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[4], v18.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmin s17, s23, s20
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
 ; CHECK-NOFP16-SD-NEXT:    fmin s0, s0, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    fcvt h2, s16
 ; CHECK-NOFP16-SD-NEXT:    fmin s1, s1, s3
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[5], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h2, s17
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[6], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[5], v7.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h0, s0
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[5], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s17
 ; CHECK-NOFP16-SD-NEXT:    fcvt h1, s1
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[6], v2.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[6], v2.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[6], v6.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v4.h[7], v0.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v5.h[7], v1.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v0.16b, v4.16b
@@ -1227,22 +1251,22 @@ define <16 x half> @min_v16f16(<16 x half> %a, <16 x half> %b) {
 ;
 ; CHECK-FP16-SD-LABEL: min_v16f16:
 ; CHECK-FP16-SD:       // %bb.0: // %entry
-; CHECK-FP16-SD-NEXT:    fmin v0.8h, v0.8h, v2.8h
 ; CHECK-FP16-SD-NEXT:    fmin v1.8h, v1.8h, v3.8h
+; CHECK-FP16-SD-NEXT:    fmin v0.8h, v0.8h, v2.8h
 ; CHECK-FP16-SD-NEXT:    ret
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v16f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v4.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v5.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v5.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v2.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v0.4s, v0.8h
-; CHECK-NOFP16-GI-NEXT:    fcvtl2 v2.4s, v2.8h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-NOFP16-GI-NEXT:    fcvtl2 v2.4s, v2.8h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v3.4s, v3.8h
-; CHECK-NOFP16-GI-NEXT:    fmin v4.4s, v4.4s, v5.4s
-; CHECK-NOFP16-GI-NEXT:    fmin v5.4s, v6.4s, v7.4s
+; CHECK-NOFP16-GI-NEXT:    fmin v4.4s, v4.4s, v6.4s
+; CHECK-NOFP16-GI-NEXT:    fmin v5.4s, v5.4s, v7.4s
 ; CHECK-NOFP16-GI-NEXT:    fmin v2.4s, v0.4s, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    fmin v3.4s, v1.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v4.4s
@@ -1264,110 +1288,110 @@ entry:
 define <16 x half> @max_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-NOFP16-SD-LABEL: max_v16f16:
 ; CHECK-NOFP16-SD:       // %bb.0: // %entry
-; CHECK-NOFP16-SD-NEXT:    mov h4, v2.h[1]
-; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h2
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h0
-; CHECK-NOFP16-SD-NEXT:    mov h16, v2.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[1]
-; CHECK-NOFP16-SD-NEXT:    mov h19, v1.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fmax s6, s7, s6
-; CHECK-NOFP16-SD-NEXT:    fcvt s20, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v2.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
-; CHECK-NOFP16-SD-NEXT:    fmax s5, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s6
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[3]
-; CHECK-NOFP16-SD-NEXT:    fmax s7, s16, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h3
-; CHECK-NOFP16-SD-NEXT:    mov h21, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h17
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
-; CHECK-NOFP16-SD-NEXT:    fmax s18, s19, s18
+; CHECK-NOFP16-SD-NEXT:    mov h6, v2.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[1]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h0
+; CHECK-NOFP16-SD-NEXT:    mov h16, v3.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h17, v1.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h18, v2.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h19, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h3
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h1
+; CHECK-NOFP16-SD-NEXT:    mov h22, v3.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    mov h19, v3.h[2]
-; CHECK-NOFP16-SD-NEXT:    fmax s16, s20, s16
-; CHECK-NOFP16-SD-NEXT:    mov h20, v1.h[2]
-; CHECK-NOFP16-SD-NEXT:    fcvt s21, h21
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s7
-; CHECK-NOFP16-SD-NEXT:    mov h7, v2.h[4]
-; CHECK-NOFP16-SD-NEXT:    fmax s6, s6, s17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v3.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s20, h20
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[2], v5.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s16
 ; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s18
-; CHECK-NOFP16-SD-NEXT:    mov h18, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
-; CHECK-NOFP16-SD-NEXT:    fmax s7, s21, s7
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[1], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
-; CHECK-NOFP16-SD-NEXT:    fmax s17, s20, s19
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
-; CHECK-NOFP16-SD-NEXT:    mov h19, v3.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h20, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[3], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fmax s7, s18, s16
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s17
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h20
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[4], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    mov h19, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[2], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fmax s7, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov h16, v2.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[3], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
+; CHECK-NOFP16-SD-NEXT:    mov h24, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fmax s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h16
 ; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
 ; CHECK-NOFP16-SD-NEXT:    fcvt s17, h18
 ; CHECK-NOFP16-SD-NEXT:    fcvt s18, h19
+; CHECK-NOFP16-SD-NEXT:    mov h19, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fmax s20, s21, s20
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h22
+; CHECK-NOFP16-SD-NEXT:    mov h22, v3.h[3]
+; CHECK-NOFP16-SD-NEXT:    fmax s6, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h7, v2.h[3]
+; CHECK-NOFP16-SD-NEXT:    mov h25, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
+; CHECK-NOFP16-SD-NEXT:    fmax s5, s16, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h23
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[3]
+; CHECK-NOFP16-SD-NEXT:    fmax s17, s18, s17
+; CHECK-NOFP16-SD-NEXT:    fcvt s18, h19
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt h19, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s20
+; CHECK-NOFP16-SD-NEXT:    fmax s16, s16, s21
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h23
+; CHECK-NOFP16-SD-NEXT:    fcvt h17, s17
+; CHECK-NOFP16-SD-NEXT:    mov h21, v2.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[1], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h22
+; CHECK-NOFP16-SD-NEXT:    mov h22, v0.h[4]
+; CHECK-NOFP16-SD-NEXT:    fmax s7, s18, s7
+; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[1], v19.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt h16, s16
+; CHECK-NOFP16-SD-NEXT:    fmax s6, s20, s6
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[2], v17.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s17, h21
+; CHECK-NOFP16-SD-NEXT:    fcvt s19, h22
+; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h23
+; CHECK-NOFP16-SD-NEXT:    mov h21, v2.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov h22, v0.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[2], v16.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h16, v3.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
+; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
+; CHECK-NOFP16-SD-NEXT:    fmax s17, s19, s17
 ; CHECK-NOFP16-SD-NEXT:    mov h19, v2.h[6]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[3], v7.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmax s18, s20, s18
+; CHECK-NOFP16-SD-NEXT:    mov h20, v3.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h21
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h22
+; CHECK-NOFP16-SD-NEXT:    fcvt s22, h24
 ; CHECK-NOFP16-SD-NEXT:    mov h2, v2.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[4], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
-; CHECK-NOFP16-SD-NEXT:    fmax s7, s16, s7
-; CHECK-NOFP16-SD-NEXT:    fmax s16, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v3.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[3], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h23
+; CHECK-NOFP16-SD-NEXT:    fcvt h17, s17
 ; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
+; CHECK-NOFP16-SD-NEXT:    fcvt s23, h25
+; CHECK-NOFP16-SD-NEXT:    fcvt h18, s18
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h20
 ; CHECK-NOFP16-SD-NEXT:    mov h3, v3.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h17
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
+; CHECK-NOFP16-SD-NEXT:    fmax s7, s21, s7
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
-; CHECK-NOFP16-SD-NEXT:    fmax s6, s6, s19
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP16-SD-NEXT:    fmax s6, s16, s6
 ; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s16
-; CHECK-NOFP16-SD-NEXT:    fmax s17, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[5], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[4], v17.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmax s16, s22, s19
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[4], v18.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmax s17, s23, s20
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
 ; CHECK-NOFP16-SD-NEXT:    fmax s0, s0, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    fcvt h2, s16
 ; CHECK-NOFP16-SD-NEXT:    fmax s1, s1, s3
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[5], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h2, s17
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[6], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[5], v7.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h0, s0
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[5], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s17
 ; CHECK-NOFP16-SD-NEXT:    fcvt h1, s1
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[6], v2.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[6], v2.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[6], v6.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v4.h[7], v0.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v5.h[7], v1.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v0.16b, v4.16b
@@ -1376,22 +1400,22 @@ define <16 x half> @max_v16f16(<16 x half> %a, <16 x half> %b) {
 ;
 ; CHECK-FP16-SD-LABEL: max_v16f16:
 ; CHECK-FP16-SD:       // %bb.0: // %entry
-; CHECK-FP16-SD-NEXT:    fmax v0.8h, v0.8h, v2.8h
 ; CHECK-FP16-SD-NEXT:    fmax v1.8h, v1.8h, v3.8h
+; CHECK-FP16-SD-NEXT:    fmax v0.8h, v0.8h, v2.8h
 ; CHECK-FP16-SD-NEXT:    ret
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v16f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v4.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v5.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v5.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v2.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v0.4s, v0.8h
-; CHECK-NOFP16-GI-NEXT:    fcvtl2 v2.4s, v2.8h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-NOFP16-GI-NEXT:    fcvtl2 v2.4s, v2.8h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v3.4s, v3.8h
-; CHECK-NOFP16-GI-NEXT:    fmax v4.4s, v4.4s, v5.4s
-; CHECK-NOFP16-GI-NEXT:    fmax v5.4s, v6.4s, v7.4s
+; CHECK-NOFP16-GI-NEXT:    fmax v4.4s, v4.4s, v6.4s
+; CHECK-NOFP16-GI-NEXT:    fmax v5.4s, v5.4s, v7.4s
 ; CHECK-NOFP16-GI-NEXT:    fmax v2.4s, v0.4s, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    fmax v3.4s, v1.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v4.4s

diff  --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index 9c38ed0b5c656b..a3a9dffabbb9dd 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -201,22 +201,34 @@ entry:
 }
 
 define <4 x double> @min_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: min_v4f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fminnm v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    fminnm v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: min_v4f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fminnm v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    fminnm v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: min_v4f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fminnm v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    fminnm v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b)
   ret <4 x double> %c
 }
 
 define <4 x double> @max_v4f64(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: max_v4f64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmaxnm v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    fmaxnm v1.2d, v1.2d, v3.2d
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: max_v4f64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmaxnm v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    fmaxnm v0.2d, v0.2d, v2.2d
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: max_v4f64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmaxnm v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    fmaxnm v1.2d, v1.2d, v3.2d
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b)
   ret <4 x double> %c
@@ -325,39 +337,39 @@ entry:
 define <7 x float> @min_v7f32(<7 x float> %a, <7 x float> %b) {
 ; CHECK-SD-LABEL: min_v7f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov x8, sp
-; CHECK-SD-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-SD-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-SD-NEXT:    add x9, sp, #32
-; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-SD-NEXT:    ldr s1, [sp, #24]
+; CHECK-SD-NEXT:    // kill: def $s7 killed $s7 def $q7
+; CHECK-SD-NEXT:    mov x8, sp
 ; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-SD-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-SD-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-SD-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $s3 killed $s3 def $q3
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[1], [x8]
+; CHECK-SD-NEXT:    ldr s1, [sp, #24]
 ; CHECK-SD-NEXT:    add x8, sp, #8
-; CHECK-SD-NEXT:    ld1 { v1.s }[1], [x9]
-; CHECK-SD-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-SD-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #32
+; CHECK-SD-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-SD-NEXT:    ld1 { v1.s }[1], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #16
-; CHECK-SD-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-SD-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[3], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #40
 ; CHECK-SD-NEXT:    ld1 { v1.s }[2], [x8]
-; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v7.4s
+; CHECK-SD-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-SD-NEXT:    fminnm v4.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v7.4s
+; CHECK-SD-NEXT:    mov s5, v4.s[1]
+; CHECK-SD-NEXT:    mov s6, v4.s[2]
+; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-SD-NEXT:    mov s1, v0.s[1]
 ; CHECK-SD-NEXT:    mov s2, v0.s[2]
 ; CHECK-SD-NEXT:    mov s3, v0.s[3]
 ; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    mov s5, v4.s[1]
-; CHECK-SD-NEXT:    mov s6, v4.s[2]
-; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: min_v7f32:
@@ -365,31 +377,31 @@ define <7 x float> @min_v7f32(<7 x float> %a, <7 x float> %b) {
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-GI-NEXT:    ldr s16, [sp]
+; CHECK-GI-NEXT:    ldr s17, [sp, #24]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
-; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
+; CHECK-GI-NEXT:    ldr s18, [sp, #32]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    ldr s19, [sp, #24]
-; CHECK-GI-NEXT:    ldr s20, [sp, #32]
-; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    ldr s17, [sp, #8]
 ; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    ldr s21, [sp, #40]
-; CHECK-GI-NEXT:    mov v19.s[1], v20.s[0]
-; CHECK-GI-NEXT:    ldr s18, [sp, #16]
+; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
+; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
+; CHECK-GI-NEXT:    ldr s5, [sp, #40]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
-; CHECK-GI-NEXT:    mov v7.s[2], v17.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v19.s[2], v21.s[0]
+; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
+; CHECK-GI-NEXT:    ldr s1, [sp, #16]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
-; CHECK-GI-NEXT:    mov v7.s[3], v18.s[0]
+; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v19.s[3], v0.s[0]
+; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fminnm v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fminnm v4.4s, v4.4s, v19.4s
+; CHECK-GI-NEXT:    fminnm v4.4s, v4.4s, v17.4s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
@@ -406,39 +418,39 @@ entry:
 define <7 x float> @max_v7f32(<7 x float> %a, <7 x float> %b) {
 ; CHECK-SD-LABEL: max_v7f32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov x8, sp
-; CHECK-SD-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-SD-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-SD-NEXT:    add x9, sp, #32
-; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-SD-NEXT:    ldr s1, [sp, #24]
+; CHECK-SD-NEXT:    // kill: def $s7 killed $s7 def $q7
+; CHECK-SD-NEXT:    mov x8, sp
 ; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-SD-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-SD-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-SD-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $s3 killed $s3 def $q3
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[1], [x8]
+; CHECK-SD-NEXT:    ldr s1, [sp, #24]
 ; CHECK-SD-NEXT:    add x8, sp, #8
-; CHECK-SD-NEXT:    ld1 { v1.s }[1], [x9]
-; CHECK-SD-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-SD-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[2], [x8]
+; CHECK-SD-NEXT:    add x8, sp, #32
+; CHECK-SD-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-SD-NEXT:    ld1 { v1.s }[1], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #16
-; CHECK-SD-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-SD-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-SD-NEXT:    ld1 { v7.s }[3], [x8]
 ; CHECK-SD-NEXT:    add x8, sp, #40
 ; CHECK-SD-NEXT:    ld1 { v1.s }[2], [x8]
-; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v7.4s
+; CHECK-SD-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-SD-NEXT:    fmaxnm v4.4s, v4.4s, v1.4s
+; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v7.4s
+; CHECK-SD-NEXT:    mov s5, v4.s[1]
+; CHECK-SD-NEXT:    mov s6, v4.s[2]
+; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-SD-NEXT:    mov s1, v0.s[1]
 ; CHECK-SD-NEXT:    mov s2, v0.s[2]
 ; CHECK-SD-NEXT:    mov s3, v0.s[3]
 ; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-SD-NEXT:    mov s5, v4.s[1]
-; CHECK-SD-NEXT:    mov s6, v4.s[2]
-; CHECK-SD-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: max_v7f32:
@@ -446,31 +458,31 @@ define <7 x float> @max_v7f32(<7 x float> %a, <7 x float> %b) {
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-GI-NEXT:    ldr s16, [sp]
+; CHECK-GI-NEXT:    ldr s17, [sp, #24]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
-; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
+; CHECK-GI-NEXT:    ldr s18, [sp, #32]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    ldr s19, [sp, #24]
-; CHECK-GI-NEXT:    ldr s20, [sp, #32]
-; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    ldr s17, [sp, #8]
 ; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    ldr s21, [sp, #40]
-; CHECK-GI-NEXT:    mov v19.s[1], v20.s[0]
-; CHECK-GI-NEXT:    ldr s18, [sp, #16]
+; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
+; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
+; CHECK-GI-NEXT:    ldr s5, [sp, #40]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
-; CHECK-GI-NEXT:    mov v7.s[2], v17.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v19.s[2], v21.s[0]
+; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
+; CHECK-GI-NEXT:    ldr s1, [sp, #16]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
-; CHECK-GI-NEXT:    mov v7.s[3], v18.s[0]
+; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v19.s[3], v0.s[0]
+; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fmaxnm v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fmaxnm v4.4s, v4.4s, v19.4s
+; CHECK-GI-NEXT:    fmaxnm v4.4s, v4.4s, v17.4s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
@@ -485,22 +497,34 @@ entry:
 }
 
 define <8 x float> @min_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: min_v8f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fminnm v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    fminnm v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: min_v8f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fminnm v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: min_v8f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fminnm v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    fminnm v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b)
   ret <8 x float> %c
 }
 
 define <8 x float> @max_v8f32(<8 x float> %a, <8 x float> %b) {
-; CHECK-LABEL: max_v8f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmaxnm v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    fmaxnm v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: max_v8f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmaxnm v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: max_v8f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmaxnm v0.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    fmaxnm v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %c = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b)
   ret <8 x float> %c
@@ -513,22 +537,22 @@ define <4 x half> @min_v4f16(<4 x half> %a, <4 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NOFP16-SD-NEXT:    mov h2, v1.h[1]
 ; CHECK-NOFP16-SD-NEXT:    mov h3, v0.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h0
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h1
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h0
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
-; CHECK-NOFP16-SD-NEXT:    fminnm s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h7
 ; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fminnm s2, s3, s2
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h5
+; CHECK-NOFP16-SD-NEXT:    fminnm s5, s7, s6
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt h0, s4
+; CHECK-NOFP16-SD-NEXT:    fminnm s3, s4, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s2
-; CHECK-NOFP16-SD-NEXT:    fminnm s3, s5, s3
+; CHECK-NOFP16-SD-NEXT:    fcvt h0, s5
 ; CHECK-NOFP16-SD-NEXT:    fcvt s4, h6
 ; CHECK-NOFP16-SD-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s3
@@ -568,22 +592,22 @@ define <4 x half> @max_v4f16(<4 x half> %a, <4 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NOFP16-SD-NEXT:    mov h2, v1.h[1]
 ; CHECK-NOFP16-SD-NEXT:    mov h3, v0.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h0
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h1
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h0
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h7
 ; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fmaxnm s2, s3, s2
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h5
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s5, s7, s6
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt h0, s4
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s3, s4, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s2
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s3, s5, s3
+; CHECK-NOFP16-SD-NEXT:    fcvt h0, s5
 ; CHECK-NOFP16-SD-NEXT:    fcvt s4, h6
 ; CHECK-NOFP16-SD-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s3
@@ -626,46 +650,46 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP16-SD-NEXT:    fminnm s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
 ; CHECK-NOFP16-SD-NEXT:    fminnm s3, s3, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s4
-; CHECK-NOFP16-SD-NEXT:    fminnm s4, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h5, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fminnm s7, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fminnm s4, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[4]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP16-SD-NEXT:    fminnm s5, s5, s16
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
-; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    fminnm s5, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
+; CHECK-NOFP16-SD-NEXT:    mov h7, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fminnm s3, s6, s3
+; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h16
-; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[6]
 ; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fminnm s3, s4, s3
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fminnm s6, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP16-SD-NEXT:    fminnm s4, s6, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fminnm s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s6
 ; CHECK-NOFP16-SD-NEXT:    fminnm s0, s0, s1
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
@@ -711,18 +735,18 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fminnm v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    fminnm v0.4s, v0.4s, v1.4s
 ; CHECK-NOFP16-GI-NEXT:    mov s1, v2.s[1]
-; CHECK-NOFP16-GI-NEXT:    mov s3, v2.s[2]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT:    mov s3, v2.s[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[1], v1.s[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[3]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[3], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h2, v1.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
@@ -734,28 +758,28 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-FP16-GI-LABEL: min_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
 ; CHECK-FP16-GI-NEXT:    mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT:    mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT:    mov h4, v0.h[3]
-; CHECK-FP16-GI-NEXT:    mov h5, v0.h[4]
-; CHECK-FP16-GI-NEXT:    mov h6, v0.h[5]
-; CHECK-FP16-GI-NEXT:    mov h7, v0.h[6]
-; CHECK-FP16-GI-NEXT:    mov h16, v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov h4, v0.h[2]
+; CHECK-FP16-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-FP16-GI-NEXT:    mov h6, v0.h[4]
+; CHECK-FP16-GI-NEXT:    mov h7, v0.h[5]
+; CHECK-FP16-GI-NEXT:    mov h16, v0.h[6]
 ; CHECK-FP16-GI-NEXT:    mov h17, v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-FP16-GI-NEXT:    mov h18, v1.h[3]
 ; CHECK-FP16-GI-NEXT:    mov h19, v1.h[4]
 ; CHECK-FP16-GI-NEXT:    mov h20, v1.h[5]
 ; CHECK-FP16-GI-NEXT:    mov h21, v1.h[6]
-; CHECK-FP16-GI-NEXT:    mov v1.h[1], v16.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-FP16-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[2], v4.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[2], v17.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[3], v5.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[3], v18.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[4], v5.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[4], v6.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[4], v19.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[5], v6.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[5], v7.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[5], v20.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[6], v7.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[6], v16.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[6], v21.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[7], v0.h[0]
@@ -789,46 +813,46 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
 ; CHECK-NOFP16-SD-NEXT:    fmaxnm s3, s3, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s4
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h5, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s7, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[4]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s5, s5, s16
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
-; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s5, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
+; CHECK-NOFP16-SD-NEXT:    mov h7, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s3, s6, s3
+; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h16
-; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[6]
 ; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s3, s4, s3
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s6, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s6, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s6
 ; CHECK-NOFP16-SD-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
@@ -874,18 +898,18 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fmaxnm v2.4s, v2.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
 ; CHECK-NOFP16-GI-NEXT:    mov s1, v2.s[1]
-; CHECK-NOFP16-GI-NEXT:    mov s3, v2.s[2]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NOFP16-GI-NEXT:    mov s3, v2.s[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[1], v1.s[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[3]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v2.s[3], v0.s[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h2, v1.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
@@ -897,28 +921,28 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-FP16-GI-LABEL: max_v7f16:
 ; CHECK-FP16-GI:       // %bb.0: // %entry
 ; CHECK-FP16-GI-NEXT:    mov h2, v0.h[1]
-; CHECK-FP16-GI-NEXT:    mov h3, v0.h[2]
-; CHECK-FP16-GI-NEXT:    mov h4, v0.h[3]
-; CHECK-FP16-GI-NEXT:    mov h5, v0.h[4]
-; CHECK-FP16-GI-NEXT:    mov h6, v0.h[5]
-; CHECK-FP16-GI-NEXT:    mov h7, v0.h[6]
-; CHECK-FP16-GI-NEXT:    mov h16, v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov h3, v1.h[1]
+; CHECK-FP16-GI-NEXT:    mov h4, v0.h[2]
+; CHECK-FP16-GI-NEXT:    mov h5, v0.h[3]
+; CHECK-FP16-GI-NEXT:    mov h6, v0.h[4]
+; CHECK-FP16-GI-NEXT:    mov h7, v0.h[5]
+; CHECK-FP16-GI-NEXT:    mov h16, v0.h[6]
 ; CHECK-FP16-GI-NEXT:    mov h17, v1.h[2]
-; CHECK-FP16-GI-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-FP16-GI-NEXT:    mov h18, v1.h[3]
 ; CHECK-FP16-GI-NEXT:    mov h19, v1.h[4]
 ; CHECK-FP16-GI-NEXT:    mov h20, v1.h[5]
 ; CHECK-FP16-GI-NEXT:    mov h21, v1.h[6]
-; CHECK-FP16-GI-NEXT:    mov v1.h[1], v16.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-FP16-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[2], v4.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[2], v17.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[3], v5.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[3], v18.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[4], v5.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[4], v6.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[4], v19.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[5], v6.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[5], v7.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[5], v20.h[0]
-; CHECK-FP16-GI-NEXT:    mov v0.h[6], v7.h[0]
+; CHECK-FP16-GI-NEXT:    mov v0.h[6], v16.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[6], v21.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-FP16-GI-NEXT:    mov v1.h[7], v0.h[0]
@@ -952,46 +976,46 @@ define <8 x half> @min_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP16-SD-NEXT:    fminnm s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
 ; CHECK-NOFP16-SD-NEXT:    fminnm s3, s3, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s4
-; CHECK-NOFP16-SD-NEXT:    fminnm s4, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h5, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fminnm s7, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fminnm s4, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[4]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP16-SD-NEXT:    fminnm s5, s5, s16
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
-; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    fminnm s5, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
+; CHECK-NOFP16-SD-NEXT:    mov h7, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fminnm s3, s6, s3
+; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h16
-; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[6]
 ; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fminnm s3, s4, s3
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fminnm s6, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP16-SD-NEXT:    fminnm s4, s6, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fminnm s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s6
 ; CHECK-NOFP16-SD-NEXT:    fminnm s0, s0, s1
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
@@ -1037,46 +1061,46 @@ define <8 x half> @max_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[2]
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[3]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
+; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
 ; CHECK-NOFP16-SD-NEXT:    fmaxnm s3, s3, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP16-SD-NEXT:    fcvt h2, s4
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h5, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s7, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[4]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s5, s5, s16
 ; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
-; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s5, s6, s5
-; CHECK-NOFP16-SD-NEXT:    mov h6, v1.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h6
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h7
+; CHECK-NOFP16-SD-NEXT:    mov h7, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h16
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h4, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s3, s6, s3
+; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
 ; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h16
-; CHECK-NOFP16-SD-NEXT:    mov h16, v0.h[6]
 ; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s3, s4, s3
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP16-SD-NEXT:    fcvt s5, h6
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s6, s16, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s6, s5
+; CHECK-NOFP16-SD-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt h3, s6
 ; CHECK-NOFP16-SD-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-NOFP16-SD-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h3, s4
@@ -1115,110 +1139,110 @@ entry:
 define <16 x half> @min_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-NOFP16-SD-LABEL: min_v16f16:
 ; CHECK-NOFP16-SD:       // %bb.0: // %entry
-; CHECK-NOFP16-SD-NEXT:    mov h4, v2.h[1]
-; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h2
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h0
-; CHECK-NOFP16-SD-NEXT:    mov h16, v2.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[1]
-; CHECK-NOFP16-SD-NEXT:    mov h19, v1.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fminnm s6, s7, s6
-; CHECK-NOFP16-SD-NEXT:    fcvt s20, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v2.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
-; CHECK-NOFP16-SD-NEXT:    fminnm s5, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s6
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[3]
-; CHECK-NOFP16-SD-NEXT:    fminnm s7, s16, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h3
-; CHECK-NOFP16-SD-NEXT:    mov h21, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h17
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
-; CHECK-NOFP16-SD-NEXT:    fminnm s18, s19, s18
+; CHECK-NOFP16-SD-NEXT:    mov h6, v2.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[1]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h0
+; CHECK-NOFP16-SD-NEXT:    mov h16, v3.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h17, v1.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h18, v2.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h19, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h3
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h1
+; CHECK-NOFP16-SD-NEXT:    mov h22, v3.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    mov h19, v3.h[2]
-; CHECK-NOFP16-SD-NEXT:    fminnm s16, s20, s16
-; CHECK-NOFP16-SD-NEXT:    mov h20, v1.h[2]
-; CHECK-NOFP16-SD-NEXT:    fcvt s21, h21
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s7
-; CHECK-NOFP16-SD-NEXT:    mov h7, v2.h[4]
-; CHECK-NOFP16-SD-NEXT:    fminnm s6, s6, s17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v3.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s20, h20
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[2], v5.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s16
 ; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s18
-; CHECK-NOFP16-SD-NEXT:    mov h18, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
-; CHECK-NOFP16-SD-NEXT:    fminnm s7, s21, s7
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[1], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
-; CHECK-NOFP16-SD-NEXT:    fminnm s17, s20, s19
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
-; CHECK-NOFP16-SD-NEXT:    mov h19, v3.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h20, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[3], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fminnm s7, s18, s16
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s17
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h20
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[4], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    mov h19, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[2], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fminnm s7, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov h16, v2.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[3], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
+; CHECK-NOFP16-SD-NEXT:    mov h24, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fminnm s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h16
 ; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
 ; CHECK-NOFP16-SD-NEXT:    fcvt s17, h18
 ; CHECK-NOFP16-SD-NEXT:    fcvt s18, h19
+; CHECK-NOFP16-SD-NEXT:    mov h19, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fminnm s20, s21, s20
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h22
+; CHECK-NOFP16-SD-NEXT:    mov h22, v3.h[3]
+; CHECK-NOFP16-SD-NEXT:    fminnm s6, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h7, v2.h[3]
+; CHECK-NOFP16-SD-NEXT:    mov h25, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
+; CHECK-NOFP16-SD-NEXT:    fminnm s5, s16, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h23
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[3]
+; CHECK-NOFP16-SD-NEXT:    fminnm s17, s18, s17
+; CHECK-NOFP16-SD-NEXT:    fcvt s18, h19
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt h19, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s20
+; CHECK-NOFP16-SD-NEXT:    fminnm s16, s16, s21
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h23
+; CHECK-NOFP16-SD-NEXT:    fcvt h17, s17
+; CHECK-NOFP16-SD-NEXT:    mov h21, v2.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[1], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h22
+; CHECK-NOFP16-SD-NEXT:    mov h22, v0.h[4]
+; CHECK-NOFP16-SD-NEXT:    fminnm s7, s18, s7
+; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[1], v19.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt h16, s16
+; CHECK-NOFP16-SD-NEXT:    fminnm s6, s20, s6
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[2], v17.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s17, h21
+; CHECK-NOFP16-SD-NEXT:    fcvt s19, h22
+; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h23
+; CHECK-NOFP16-SD-NEXT:    mov h21, v2.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov h22, v0.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[2], v16.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h16, v3.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
+; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
+; CHECK-NOFP16-SD-NEXT:    fminnm s17, s19, s17
 ; CHECK-NOFP16-SD-NEXT:    mov h19, v2.h[6]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[3], v7.h[0]
+; CHECK-NOFP16-SD-NEXT:    fminnm s18, s20, s18
+; CHECK-NOFP16-SD-NEXT:    mov h20, v3.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h21
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h22
+; CHECK-NOFP16-SD-NEXT:    fcvt s22, h24
 ; CHECK-NOFP16-SD-NEXT:    mov h2, v2.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[4], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
-; CHECK-NOFP16-SD-NEXT:    fminnm s7, s16, s7
-; CHECK-NOFP16-SD-NEXT:    fminnm s16, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v3.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[3], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h23
+; CHECK-NOFP16-SD-NEXT:    fcvt h17, s17
 ; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
+; CHECK-NOFP16-SD-NEXT:    fcvt s23, h25
+; CHECK-NOFP16-SD-NEXT:    fcvt h18, s18
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h20
 ; CHECK-NOFP16-SD-NEXT:    mov h3, v3.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h17
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
+; CHECK-NOFP16-SD-NEXT:    fminnm s7, s21, s7
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
-; CHECK-NOFP16-SD-NEXT:    fminnm s6, s6, s19
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP16-SD-NEXT:    fminnm s6, s16, s6
 ; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s16
-; CHECK-NOFP16-SD-NEXT:    fminnm s17, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[5], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[4], v17.h[0]
+; CHECK-NOFP16-SD-NEXT:    fminnm s16, s22, s19
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[4], v18.h[0]
+; CHECK-NOFP16-SD-NEXT:    fminnm s17, s23, s20
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
 ; CHECK-NOFP16-SD-NEXT:    fminnm s0, s0, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    fcvt h2, s16
 ; CHECK-NOFP16-SD-NEXT:    fminnm s1, s1, s3
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[5], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h2, s17
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[6], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[5], v7.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h0, s0
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[5], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s17
 ; CHECK-NOFP16-SD-NEXT:    fcvt h1, s1
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[6], v2.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[6], v2.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[6], v6.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v4.h[7], v0.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v5.h[7], v1.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v0.16b, v4.16b
@@ -1227,22 +1251,22 @@ define <16 x half> @min_v16f16(<16 x half> %a, <16 x half> %b) {
 ;
 ; CHECK-FP16-SD-LABEL: min_v16f16:
 ; CHECK-FP16-SD:       // %bb.0: // %entry
-; CHECK-FP16-SD-NEXT:    fminnm v0.8h, v0.8h, v2.8h
 ; CHECK-FP16-SD-NEXT:    fminnm v1.8h, v1.8h, v3.8h
+; CHECK-FP16-SD-NEXT:    fminnm v0.8h, v0.8h, v2.8h
 ; CHECK-FP16-SD-NEXT:    ret
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v16f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v4.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v5.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v5.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v2.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v0.4s, v0.8h
-; CHECK-NOFP16-GI-NEXT:    fcvtl2 v2.4s, v2.8h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-NOFP16-GI-NEXT:    fcvtl2 v2.4s, v2.8h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v3.4s, v3.8h
-; CHECK-NOFP16-GI-NEXT:    fminnm v4.4s, v4.4s, v5.4s
-; CHECK-NOFP16-GI-NEXT:    fminnm v5.4s, v6.4s, v7.4s
+; CHECK-NOFP16-GI-NEXT:    fminnm v4.4s, v4.4s, v6.4s
+; CHECK-NOFP16-GI-NEXT:    fminnm v5.4s, v5.4s, v7.4s
 ; CHECK-NOFP16-GI-NEXT:    fminnm v2.4s, v0.4s, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    fminnm v3.4s, v1.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v4.4s
@@ -1264,110 +1288,110 @@ entry:
 define <16 x half> @max_v16f16(<16 x half> %a, <16 x half> %b) {
 ; CHECK-NOFP16-SD-LABEL: max_v16f16:
 ; CHECK-NOFP16-SD:       // %bb.0: // %entry
-; CHECK-NOFP16-SD-NEXT:    mov h4, v2.h[1]
-; CHECK-NOFP16-SD-NEXT:    mov h5, v0.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h2
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h0
-; CHECK-NOFP16-SD-NEXT:    mov h16, v2.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[2]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[1]
-; CHECK-NOFP16-SD-NEXT:    mov h19, v1.h[1]
-; CHECK-NOFP16-SD-NEXT:    fcvt s4, h4
-; CHECK-NOFP16-SD-NEXT:    fcvt s5, h5
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s6, s7, s6
-; CHECK-NOFP16-SD-NEXT:    fcvt s20, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v2.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s5, s5, s4
-; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt h4, s6
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[3]
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s7, s16, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h3
-; CHECK-NOFP16-SD-NEXT:    mov h21, v0.h[4]
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h17
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s5
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s18, s19, s18
+; CHECK-NOFP16-SD-NEXT:    mov h6, v2.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h7, v0.h[1]
+; CHECK-NOFP16-SD-NEXT:    fcvt s4, h2
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h0
+; CHECK-NOFP16-SD-NEXT:    mov h16, v3.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h17, v1.h[1]
+; CHECK-NOFP16-SD-NEXT:    mov h18, v2.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h19, v0.h[2]
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h3
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h1
+; CHECK-NOFP16-SD-NEXT:    mov h22, v3.h[2]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[2]
 ; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    mov h19, v3.h[2]
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s16, s20, s16
-; CHECK-NOFP16-SD-NEXT:    mov h20, v1.h[2]
-; CHECK-NOFP16-SD-NEXT:    fcvt s21, h21
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s7
-; CHECK-NOFP16-SD-NEXT:    mov h7, v2.h[4]
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s6, s6, s17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v3.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s20, h20
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[2], v5.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h5, s16
 ; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s18
-; CHECK-NOFP16-SD-NEXT:    mov h18, v1.h[3]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s7, s21, s7
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[1], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s17, s20, s19
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
-; CHECK-NOFP16-SD-NEXT:    mov h19, v3.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov h20, v1.h[4]
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[3], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s7, s18, s16
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s17
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h20
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[4], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    mov h19, v1.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[2], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s7, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov h16, v2.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov h17, v0.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[5]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[3], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s7, h16
+; CHECK-NOFP16-SD-NEXT:    mov h24, v0.h[6]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s4, s5, s4
+; CHECK-NOFP16-SD-NEXT:    fcvt s5, h16
 ; CHECK-NOFP16-SD-NEXT:    fcvt s16, h17
 ; CHECK-NOFP16-SD-NEXT:    fcvt s17, h18
 ; CHECK-NOFP16-SD-NEXT:    fcvt s18, h19
+; CHECK-NOFP16-SD-NEXT:    mov h19, v0.h[3]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s20, s21, s20
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h22
+; CHECK-NOFP16-SD-NEXT:    mov h22, v3.h[3]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s6, s7, s6
+; CHECK-NOFP16-SD-NEXT:    mov h7, v2.h[3]
+; CHECK-NOFP16-SD-NEXT:    mov h25, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt h4, s4
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s5, s16, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h23
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[3]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s17, s18, s17
+; CHECK-NOFP16-SD-NEXT:    fcvt s18, h19
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h7
+; CHECK-NOFP16-SD-NEXT:    fcvt h19, s5
+; CHECK-NOFP16-SD-NEXT:    fcvt h5, s20
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s16, s16, s21
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h23
+; CHECK-NOFP16-SD-NEXT:    fcvt h17, s17
+; CHECK-NOFP16-SD-NEXT:    mov h21, v2.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[1], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h22
+; CHECK-NOFP16-SD-NEXT:    mov h22, v0.h[4]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s7, s18, s7
+; CHECK-NOFP16-SD-NEXT:    mov h18, v3.h[4]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[1], v19.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt h16, s16
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s6, s20, s6
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[2], v17.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s17, h21
+; CHECK-NOFP16-SD-NEXT:    fcvt s19, h22
+; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
+; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h23
+; CHECK-NOFP16-SD-NEXT:    mov h21, v2.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov h22, v0.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[2], v16.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov h16, v3.h[5]
+; CHECK-NOFP16-SD-NEXT:    mov h23, v1.h[5]
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
+; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s17, s19, s17
 ; CHECK-NOFP16-SD-NEXT:    mov h19, v2.h[6]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[3], v7.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s18, s20, s18
+; CHECK-NOFP16-SD-NEXT:    mov h20, v3.h[6]
+; CHECK-NOFP16-SD-NEXT:    fcvt s7, h21
+; CHECK-NOFP16-SD-NEXT:    fcvt s21, h22
+; CHECK-NOFP16-SD-NEXT:    fcvt s22, h24
 ; CHECK-NOFP16-SD-NEXT:    mov h2, v2.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[4], v6.h[0]
-; CHECK-NOFP16-SD-NEXT:    mov h6, v0.h[6]
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s7, s16, s7
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s16, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov h17, v3.h[6]
-; CHECK-NOFP16-SD-NEXT:    mov h18, v1.h[6]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[3], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt s6, h16
+; CHECK-NOFP16-SD-NEXT:    fcvt s16, h23
+; CHECK-NOFP16-SD-NEXT:    fcvt h17, s17
 ; CHECK-NOFP16-SD-NEXT:    fcvt s19, h19
-; CHECK-NOFP16-SD-NEXT:    fcvt s6, h6
-; CHECK-NOFP16-SD-NEXT:    mov h0, v0.h[7]
+; CHECK-NOFP16-SD-NEXT:    fcvt s23, h25
+; CHECK-NOFP16-SD-NEXT:    fcvt h18, s18
+; CHECK-NOFP16-SD-NEXT:    fcvt s20, h20
 ; CHECK-NOFP16-SD-NEXT:    mov h3, v3.h[7]
-; CHECK-NOFP16-SD-NEXT:    mov h1, v1.h[7]
-; CHECK-NOFP16-SD-NEXT:    fcvt s17, h17
-; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
-; CHECK-NOFP16-SD-NEXT:    fcvt s18, h18
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s7, s21, s7
 ; CHECK-NOFP16-SD-NEXT:    fcvt s2, h2
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s6, s6, s19
 ; CHECK-NOFP16-SD-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s6, s16, s6
 ; CHECK-NOFP16-SD-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-SD-NEXT:    fcvt h16, s16
-; CHECK-NOFP16-SD-NEXT:    fmaxnm s17, s18, s17
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[5], v7.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[4], v17.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s16, s22, s19
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[4], v18.h[0]
+; CHECK-NOFP16-SD-NEXT:    fmaxnm s17, s23, s20
+; CHECK-NOFP16-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP16-SD-NEXT:    fcvt h7, s7
 ; CHECK-NOFP16-SD-NEXT:    fmaxnm s0, s0, s2
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s6
+; CHECK-NOFP16-SD-NEXT:    fcvt h2, s16
 ; CHECK-NOFP16-SD-NEXT:    fmaxnm s1, s1, s3
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[5], v16.h[0]
-; CHECK-NOFP16-SD-NEXT:    fcvt h2, s17
-; CHECK-NOFP16-SD-NEXT:    mov v4.h[6], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[5], v7.h[0]
 ; CHECK-NOFP16-SD-NEXT:    fcvt h0, s0
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[5], v6.h[0]
+; CHECK-NOFP16-SD-NEXT:    fcvt h6, s17
 ; CHECK-NOFP16-SD-NEXT:    fcvt h1, s1
-; CHECK-NOFP16-SD-NEXT:    mov v5.h[6], v2.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v4.h[6], v2.h[0]
+; CHECK-NOFP16-SD-NEXT:    mov v5.h[6], v6.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v4.h[7], v0.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v5.h[7], v1.h[0]
 ; CHECK-NOFP16-SD-NEXT:    mov v0.16b, v4.16b
@@ -1376,22 +1400,22 @@ define <16 x half> @max_v16f16(<16 x half> %a, <16 x half> %b) {
 ;
 ; CHECK-FP16-SD-LABEL: max_v16f16:
 ; CHECK-FP16-SD:       // %bb.0: // %entry
-; CHECK-FP16-SD-NEXT:    fmaxnm v0.8h, v0.8h, v2.8h
 ; CHECK-FP16-SD-NEXT:    fmaxnm v1.8h, v1.8h, v3.8h
+; CHECK-FP16-SD-NEXT:    fmaxnm v0.8h, v0.8h, v2.8h
 ; CHECK-FP16-SD-NEXT:    ret
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v16f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v4.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v5.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v5.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v2.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v0.4s, v0.8h
-; CHECK-NOFP16-GI-NEXT:    fcvtl2 v2.4s, v2.8h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v1.4s, v1.8h
+; CHECK-NOFP16-GI-NEXT:    fcvtl2 v2.4s, v2.8h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl2 v3.4s, v3.8h
-; CHECK-NOFP16-GI-NEXT:    fmaxnm v4.4s, v4.4s, v5.4s
-; CHECK-NOFP16-GI-NEXT:    fmaxnm v5.4s, v6.4s, v7.4s
+; CHECK-NOFP16-GI-NEXT:    fmaxnm v4.4s, v4.4s, v6.4s
+; CHECK-NOFP16-GI-NEXT:    fmaxnm v5.4s, v5.4s, v7.4s
 ; CHECK-NOFP16-GI-NEXT:    fmaxnm v2.4s, v0.4s, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    fmaxnm v3.4s, v1.4s, v3.4s
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v4.4s

diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
index fc6d4e7431c4d3..20737a73183944 100644
--- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
+++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
@@ -41,13 +41,13 @@ define void @loop(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K,
 ; CHECK-LABEL: loop:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    mov w8, w3
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, w3
 ; CHECK-NEXT:  .LBB1_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x1], #2
-; CHECK-NEXT:    ldr q3, [x2], #2
 ; CHECK-NEXT:    subs x8, x8, #1
+; CHECK-NEXT:    ldr q3, [x2], #2
 ; CHECK-NEXT:    fmlal v0.4s, v3.4h, v2.h[0]
 ; CHECK-NEXT:    fmlal2 v1.4s, v3.4h, v2.h[0]
 ; CHECK-NEXT:    b.ne .LBB1_1
@@ -84,8 +84,8 @@ define void @sink(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K,
 ; CHECK-LABEL: sink:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    mov w8, w3
 ; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, w3
 ; CHECK-NEXT:  .LBB2_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q3, [x2], #2

diff --git a/llvm/test/CodeGen/AArch64/fold-csel-cttz-and.ll b/llvm/test/CodeGen/AArch64/fold-csel-cttz-and.ll
index 89d1d0b1f1d4da..e2ea83d54633e6 100644
--- a/llvm/test/CodeGen/AArch64/fold-csel-cttz-and.ll
+++ b/llvm/test/CodeGen/AArch64/fold-csel-cttz-and.ll
@@ -112,9 +112,9 @@ define i32 @cttzlhsnot0(i32 %x) {
 ; CHECK-LABEL: cttzlhsnot0:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    rbit w9, w0
-; CHECK-NEXT:    mov w8, #10
-; CHECK-NEXT:    clz w9, w9
+; CHECK-NEXT:    mov w8, #10 // =0xa
 ; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    clz w9, w9
 ; CHECK-NEXT:    csel w0, w8, w9, eq
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 2cdbc109e3bf12..1ea87bb6b04b51 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -29,10 +29,10 @@ define void @fptoui_v8f32_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: fptoui_v8f32_to_v8i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh0:
-; CHECK-NEXT:    adrp x9, lCPI0_0@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
 ; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:    ldr q0, [x9, lCPI0_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI0_0@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8, lsl #5
@@ -71,10 +71,10 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: fptoui_v8f32_to_v8i8_no_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    fcvtzs.4s v0, v0
 ; CHECK-NEXT:    fcvtzs.4s v1, v1
-; CHECK-NEXT:    xtn.4h v0, v0
+; CHECK-NEXT:    fcvtzs.4s v0, v0
 ; CHECK-NEXT:    xtn.4h v1, v1
+; CHECK-NEXT:    xtn.4h v0, v0
 ; CHECK-NEXT:    uzp1.8b v0, v0, v1
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
@@ -109,22 +109,22 @@ define void @fptoui_2x_v8f32_to_v8i8_in_loop(ptr %A, ptr %B, ptr %dst) {
 ; CHECK-LABEL: fptoui_2x_v8f32_to_v8i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:    adrp x9, lCPI2_0@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI2_0@PAGE
 ; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    ldr q0, [x9, lCPI2_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB2_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    lsl x9, x8, #5
 ; CHECK-NEXT:    add x10, x0, x9
 ; CHECK-NEXT:    add x9, x1, x9
 ; CHECK-NEXT:    ldp q2, q1, [x10]
-; CHECK-NEXT:    ldp q4, q3, [x9]
-; CHECK-NEXT:    fcvtzu.4s v17, v1
-; CHECK-NEXT:    fcvtzu.4s v16, v2
-; CHECK-NEXT:    fcvtzu.4s v19, v3
-; CHECK-NEXT:    fcvtzu.4s v18, v4
-; CHECK-NEXT:    tbl.16b v1, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT:    fcvtzu.4s v5, v1
+; CHECK-NEXT:    ldp q1, q3, [x9]
+; CHECK-NEXT:    fcvtzu.4s v4, v2
+; CHECK-NEXT:    fcvtzu.4s v7, v3
+; CHECK-NEXT:    fcvtzu.4s v6, v1
+; CHECK-NEXT:    tbl.16b v1, { v4, v5, v6, v7 }, v0
 ; CHECK-NEXT:    str q1, [x2, x8, lsl #4]
 ; CHECK-NEXT:    add x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #1000
@@ -176,22 +176,22 @@ define void @fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle(ptr %A, ptr %B, p
 ; CHECK-LABEL: fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh4:
-; CHECK-NEXT:    adrp x9, lCPI3_0@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI3_0@PAGE
 ; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:    ldr q0, [x9, lCPI3_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI3_0@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB3_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    lsl x9, x8, #5
 ; CHECK-NEXT:    add x10, x0, x9
 ; CHECK-NEXT:    add x9, x1, x9
 ; CHECK-NEXT:    ldp q2, q1, [x10]
-; CHECK-NEXT:    ldp q4, q3, [x9]
-; CHECK-NEXT:    fcvtzu.4s v17, v1
-; CHECK-NEXT:    fcvtzu.4s v16, v2
-; CHECK-NEXT:    fcvtzu.4s v19, v3
-; CHECK-NEXT:    fcvtzu.4s v18, v4
-; CHECK-NEXT:    tbl.16b v1, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT:    fcvtzu.4s v5, v1
+; CHECK-NEXT:    ldp q1, q3, [x9]
+; CHECK-NEXT:    fcvtzu.4s v4, v2
+; CHECK-NEXT:    fcvtzu.4s v7, v3
+; CHECK-NEXT:    fcvtzu.4s v6, v1
+; CHECK-NEXT:    tbl.16b v1, { v4, v5, v6, v7 }, v0
 ; CHECK-NEXT:    str q1, [x2, x8, lsl #4]
 ; CHECK-NEXT:    add x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #1000
@@ -243,22 +243,22 @@ define void @fptoui_v16f32_to_v16i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: fptoui_v16f32_to_v16i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh6:
-; CHECK-NEXT:    adrp x9, lCPI4_0@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI4_0@PAGE
 ; CHECK-NEXT:  Lloh7:
-; CHECK-NEXT:    ldr q0, [x9, lCPI4_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI4_0@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB4_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8, lsl #6
 ; CHECK-NEXT:    add x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #1000
 ; CHECK-NEXT:    ldp q2, q1, [x9, #32]
-; CHECK-NEXT:    ldp q4, q3, [x9]
-; CHECK-NEXT:    fcvtzu.4s v19, v1
-; CHECK-NEXT:    fcvtzu.4s v18, v2
-; CHECK-NEXT:    fcvtzu.4s v17, v3
-; CHECK-NEXT:    fcvtzu.4s v16, v4
-; CHECK-NEXT:    tbl.16b v1, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT:    fcvtzu.4s v7, v1
+; CHECK-NEXT:    ldp q1, q3, [x9]
+; CHECK-NEXT:    fcvtzu.4s v6, v2
+; CHECK-NEXT:    fcvtzu.4s v5, v3
+; CHECK-NEXT:    fcvtzu.4s v4, v1
+; CHECK-NEXT:    tbl.16b v1, { v4, v5, v6, v7 }, v0
 ; CHECK-NEXT:    str q1, [x1], #32
 ; CHECK-NEXT:    b.eq LBB4_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
@@ -304,32 +304,32 @@ define void @fptoui_2x_v16f32_to_v16i8_in_loop(ptr %A, ptr %B, ptr %dst) {
 ; CHECK-LABEL: fptoui_2x_v16f32_to_v16i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh8:
-; CHECK-NEXT:    adrp x9, lCPI5_0@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI5_0@PAGE
 ; CHECK-NEXT:  Lloh9:
-; CHECK-NEXT:    ldr q0, [x9, lCPI5_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI5_0@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB5_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    lsl x9, x8, #6
-; CHECK-NEXT:    add x10, x0, x9
-; CHECK-NEXT:    add x9, x1, x9
-; CHECK-NEXT:    ldp q1, q2, [x10, #32]
+; CHECK-NEXT:    add x10, x1, x9
+; CHECK-NEXT:    add x9, x0, x9
+; CHECK-NEXT:    ldp q2, q1, [x10, #32]
 ; CHECK-NEXT:    ldp q3, q4, [x9, #32]
-; CHECK-NEXT:    fcvtzu.4s v24, v2
-; CHECK-NEXT:    fcvtzu.4s v23, v1
-; CHECK-NEXT:    ldp q5, q6, [x9]
-; CHECK-NEXT:    fcvtzu.4s v20, v4
+; CHECK-NEXT:    ldp q5, q6, [x10]
+; CHECK-NEXT:    fcvtzu.4s v19, v1
+; CHECK-NEXT:    fcvtzu.4s v18, v2
+; CHECK-NEXT:    ldp q2, q1, [x9]
+; CHECK-NEXT:    fcvtzu.4s v23, v4
+; CHECK-NEXT:    fcvtzu.4s v17, v6
 ; CHECK-NEXT:    add x9, x2, x8, lsl #5
-; CHECK-NEXT:    fcvtzu.4s v19, v3
+; CHECK-NEXT:    fcvtzu.4s v22, v3
+; CHECK-NEXT:    fcvtzu.4s v16, v5
 ; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    fcvtzu.4s v21, v1
 ; CHECK-NEXT:    cmp x8, #1000
-; CHECK-NEXT:    ldp q7, q16, [x10]
-; CHECK-NEXT:    fcvtzu.4s v18, v6
-; CHECK-NEXT:    fcvtzu.4s v17, v5
-; CHECK-NEXT:    fcvtzu.4s v22, v16
-; CHECK-NEXT:    fcvtzu.4s v21, v7
-; CHECK-NEXT:    tbl.16b v1, { v17, v18, v19, v20 }, v0
-; CHECK-NEXT:    tbl.16b v2, { v21, v22, v23, v24 }, v0
+; CHECK-NEXT:    fcvtzu.4s v20, v2
+; CHECK-NEXT:    tbl.16b v1, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT:    tbl.16b v2, { v20, v21, v22, v23 }, v0
 ; CHECK-NEXT:    stp q2, q1, [x9]
 ; CHECK-NEXT:    b.eq LBB5_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
@@ -365,8 +365,8 @@ define void @fptoui_v8f32_to_v8i16_in_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8, lsl #5
 ; CHECK-NEXT:    ldp q0, q1, [x9]
-; CHECK-NEXT:    fcvtzu.4s v0, v0
 ; CHECK-NEXT:    fcvtzu.4s v1, v1
+; CHECK-NEXT:    fcvtzu.4s v0, v0
 ; CHECK-NEXT:    uzp1.8h v0, v0, v1
 ; CHECK-NEXT:    str q0, [x1, x8, lsl #4]
 ; CHECK-NEXT:    add x8, x8, #1
@@ -400,17 +400,17 @@ define void @fptoui_2x_v8f32_to_v8i16_in_loop(ptr %A, ptr %B, ptr %dst) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    lsl x9, x8, #5
 ; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, #1000
 ; CHECK-NEXT:    add x10, x0, x9
 ; CHECK-NEXT:    add x11, x1, x9
 ; CHECK-NEXT:    add x9, x2, x9
-; CHECK-NEXT:    cmp x8, #1000
 ; CHECK-NEXT:    ldp q0, q1, [x10]
-; CHECK-NEXT:    fcvtzu.4s v0, v0
 ; CHECK-NEXT:    ldp q2, q3, [x11]
 ; CHECK-NEXT:    fcvtzu.4s v1, v1
+; CHECK-NEXT:    fcvtzu.4s v0, v0
+; CHECK-NEXT:    fcvtzu.4s v3, v3
 ; CHECK-NEXT:    fcvtzu.4s v2, v2
 ; CHECK-NEXT:    uzp1.8h v0, v0, v1
-; CHECK-NEXT:    fcvtzu.4s v3, v3
 ; CHECK-NEXT:    uzp1.8h v1, v2, v3
 ; CHECK-NEXT:    stp q0, q1, [x9]
 ; CHECK-NEXT:    b.eq LBB7_1
@@ -477,14 +477,14 @@ define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: uitofp_v8i8_to_v8f32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh10:
-; CHECK-NEXT:    adrp x9, lCPI8_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI8_0@PAGE
 ; CHECK-NEXT:  Lloh11:
-; CHECK-NEXT:    adrp x10, lCPI8_1@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x9, lCPI8_1@PAGE
 ; CHECK-NEXT:  Lloh12:
-; CHECK-NEXT:    ldr q0, [x9, lCPI8_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI8_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh13:
-; CHECK-NEXT:    ldr q1, [x10, lCPI8_1@PAGEOFF]
+; CHECK-NEXT:    ldr q1, [x9, lCPI8_1@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB8_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0, x8, lsl #3]
@@ -592,22 +592,22 @@ define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: uitofp_v16i8_to_v16f32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh14:
-; CHECK-NEXT:    adrp x9, lCPI9_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI9_0@PAGE
 ; CHECK-NEXT:  Lloh15:
-; CHECK-NEXT:    adrp x10, lCPI9_1@PAGE
+; CHECK-NEXT:    adrp x9, lCPI9_1@PAGE
 ; CHECK-NEXT:  Lloh16:
-; CHECK-NEXT:    adrp x11, lCPI9_2@PAGE
+; CHECK-NEXT:    adrp x10, lCPI9_2@PAGE
 ; CHECK-NEXT:  Lloh17:
-; CHECK-NEXT:    adrp x12, lCPI9_3@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    ldr q0, [x8, lCPI9_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh18:
-; CHECK-NEXT:    ldr q0, [x9, lCPI9_0@PAGEOFF]
+; CHECK-NEXT:    adrp x8, lCPI9_3@PAGE
 ; CHECK-NEXT:  Lloh19:
-; CHECK-NEXT:    ldr q1, [x10, lCPI9_1@PAGEOFF]
+; CHECK-NEXT:    ldr q1, [x9, lCPI9_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh20:
-; CHECK-NEXT:    ldr q2, [x11, lCPI9_2@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x10, lCPI9_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh21:
-; CHECK-NEXT:    ldr q3, [x12, lCPI9_3@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x8, lCPI9_3@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB9_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q4, [x0, x8, lsl #4]
@@ -627,10 +627,11 @@ define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    b.eq LBB9_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh17, Lloh21
+; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh21
 ; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh20
 ; CHECK-NEXT:    .loh AdrpLdr Lloh15, Lloh19
-; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh18
+; CHECK-NEXT:    .loh AdrpAdrp Lloh14, Lloh18
+; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh17
 entry:
   br label %loop
 

diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
index c1cdfa6419deb6..1a9ba9fd4a5180 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
@@ -233,22 +233,22 @@ define <4 x i1> @fcmp_v4f32(<4 x float> %x, <4 x float> %y) #0 {
 ; CHECK-NEXT:    mov s2, v1.s[1]
 ; CHECK-NEXT:    mov s3, v0.s[1]
 ; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov s4, v1.s[2]
-; CHECK-NEXT:    mov s5, v0.s[2]
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    mov s2, v1.s[2]
+; CHECK-NEXT:    mov s3, v0.s[2]
+; CHECK-NEXT:    fmov s4, w8
 ; CHECK-NEXT:    mov s1, v1.s[3]
 ; CHECK-NEXT:    mov s0, v0.s[3]
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    mov v4.s[1], w8
 ; CHECK-NEXT:    fcmp s3, s2
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fcmp s5, s4
-; CHECK-NEXT:    mov v2.s[1], w8
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    fcmp s0, s1
-; CHECK-NEXT:    mov v2.s[2], w8
+; CHECK-NEXT:    mov v4.s[2], w8
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    mov v2.s[3], w8
-; CHECK-NEXT:    xtn v0.4h, v2.4s
+; CHECK-NEXT:    mov v4.s[3], w8
+; CHECK-NEXT:    xtn v0.4h, v4.4s
 ; CHECK-NEXT:    ret
 entry:
   %val = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x float> %x, <4 x float> %y, metadata !"oeq", metadata !"fpexcept.strict")
@@ -261,22 +261,22 @@ define <4 x i1> @fcmps_v4f32(<4 x float> %x, <4 x float> %y) #0 {
 ; CHECK-NEXT:    mov s2, v1.s[1]
 ; CHECK-NEXT:    mov s3, v0.s[1]
 ; CHECK-NEXT:    fcmpe s0, s1
-; CHECK-NEXT:    mov s4, v1.s[2]
-; CHECK-NEXT:    mov s5, v0.s[2]
+; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    fcmpe s3, s2
+; CHECK-NEXT:    mov s2, v1.s[2]
+; CHECK-NEXT:    mov s3, v0.s[2]
+; CHECK-NEXT:    fmov s4, w8
 ; CHECK-NEXT:    mov s1, v1.s[3]
 ; CHECK-NEXT:    mov s0, v0.s[3]
 ; CHECK-NEXT:    csetm w8, eq
+; CHECK-NEXT:    mov v4.s[1], w8
 ; CHECK-NEXT:    fcmpe s3, s2
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    fcmpe s5, s4
-; CHECK-NEXT:    mov v2.s[1], w8
 ; CHECK-NEXT:    csetm w8, eq
 ; CHECK-NEXT:    fcmpe s0, s1
-; CHECK-NEXT:    mov v2.s[2], w8
+; CHECK-NEXT:    mov v4.s[2], w8
 ; CHECK-NEXT:    csetm w8, eq
-; CHECK-NEXT:    mov v2.s[3], w8
-; CHECK-NEXT:    xtn v0.4h, v2.4s
+; CHECK-NEXT:    mov v4.s[3], w8
+; CHECK-NEXT:    xtn v0.4h, v4.4s
 ; CHECK-NEXT:    ret
 entry:
   %val = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(<4 x float> %x, <4 x float> %y, metadata !"oeq", metadata !"fpexcept.strict")

diff --git a/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
index 6aef8cd9ba619a..4c112cf89aec17 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
@@ -5,8 +5,8 @@
 define <16 x half> @sitofp_i32(<16 x i32> %a) #0 {
 ; CHECK-LABEL: sitofp_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    scvtf v2.4s, v2.4s
 ; CHECK-NEXT:    scvtf v0.4s, v0.4s
+; CHECK-NEXT:    scvtf v2.4s, v2.4s
 ; CHECK-NEXT:    scvtf v4.4s, v1.4s
 ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NEXT:    fcvtn v1.4h, v2.4s
@@ -30,16 +30,16 @@ define <16 x half> @sitofp_i64(<16 x i64> %a) #0 {
 ; CHECK-NEXT:    scvtf v6.2d, v6.2d
 ; CHECK-NEXT:    scvtf v5.2d, v5.2d
 ; CHECK-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-NEXT:    scvtf v7.2d, v7.2d
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-NEXT:    fcvtn v4.2s, v4.2d
 ; CHECK-NEXT:    fcvtn v2.2s, v2.2d
 ; CHECK-NEXT:    fcvtn v6.2s, v6.2d
 ; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT:    scvtf v1.2d, v7.2d
 ; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
 ; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v7.2d
 ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    fcvtn2 v6.4s, v1.2d
 ; CHECK-NEXT:    fcvtn v1.4h, v4.4s
 ; CHECK-NEXT:    fcvtn2 v0.8h, v2.4s
 ; CHECK-NEXT:    fcvtn2 v1.8h, v6.4s
@@ -56,8 +56,8 @@ define <16 x half> @sitofp_i64(<16 x i64> %a) #0 {
 define <16 x half> @uitofp_i32(<16 x i32> %a) #0 {
 ; CHECK-LABEL: uitofp_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf v2.4s, v2.4s
 ; CHECK-NEXT:    ucvtf v0.4s, v0.4s
+; CHECK-NEXT:    ucvtf v2.4s, v2.4s
 ; CHECK-NEXT:    ucvtf v4.4s, v1.4s
 ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NEXT:    fcvtn v1.4h, v2.4s
@@ -81,16 +81,16 @@ define <16 x half> @uitofp_i64(<16 x i64> %a) #0 {
 ; CHECK-NEXT:    ucvtf v6.2d, v6.2d
 ; CHECK-NEXT:    ucvtf v5.2d, v5.2d
 ; CHECK-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-NEXT:    ucvtf v7.2d, v7.2d
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-NEXT:    fcvtn v4.2s, v4.2d
 ; CHECK-NEXT:    fcvtn v2.2s, v2.2d
 ; CHECK-NEXT:    fcvtn v6.2s, v6.2d
 ; CHECK-NEXT:    fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT:    ucvtf v1.2d, v7.2d
 ; CHECK-NEXT:    fcvtn2 v4.4s, v5.2d
 ; CHECK-NEXT:    fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT:    fcvtn2 v6.4s, v7.2d
 ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    fcvtn2 v6.4s, v1.2d
 ; CHECK-NEXT:    fcvtn v1.4h, v4.4s
 ; CHECK-NEXT:    fcvtn2 v0.8h, v2.4s
 ; CHECK-NEXT:    fcvtn2 v1.8h, v6.4s

diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
index ee81b0b500c430..7ff61d9bcb0cfc 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -12,46 +12,46 @@ define <8 x half> @add_h(<8 x half> %a, <8 x half> %b) {
 ; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    mov h7, v0.h[2]
 ; CHECK-CVT-NEXT:    mov h16, v1.h[3]
-; CHECK-CVT-NEXT:    mov h17, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
 ; CHECK-CVT-NEXT:    fadd s4, s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s6, h7
-; CHECK-CVT-NEXT:    fcvt s7, h16
-; CHECK-CVT-NEXT:    fcvt s16, h17
+; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    fcvt s6, h6
+; CHECK-CVT-NEXT:    fcvt s7, h7
+; CHECK-CVT-NEXT:    fcvt s16, h16
 ; CHECK-CVT-NEXT:    fadd s3, s3, s2
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    fcvt h2, s4
-; CHECK-CVT-NEXT:    fadd s4, s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    fadd s7, s16, s7
+; CHECK-CVT-NEXT:    fadd s4, s7, s6
+; CHECK-CVT-NEXT:    mov h6, v1.h[4]
+; CHECK-CVT-NEXT:    mov h7, v0.h[4]
 ; CHECK-CVT-NEXT:    fcvt h3, s3
+; CHECK-CVT-NEXT:    fadd s5, s5, s16
 ; CHECK-CVT-NEXT:    mov h16, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt h7, s7
+; CHECK-CVT-NEXT:    fcvt h4, s4
 ; CHECK-CVT-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-CVT-NEXT:    fcvt h3, s4
-; CHECK-CVT-NEXT:    mov h4, v1.h[5]
-; CHECK-CVT-NEXT:    fadd s5, s6, s5
-; CHECK-CVT-NEXT:    mov h6, v1.h[6]
-; CHECK-CVT-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-CVT-NEXT:    fcvt s3, h6
+; CHECK-CVT-NEXT:    fcvt s6, h7
+; CHECK-CVT-NEXT:    mov h7, v1.h[5]
+; CHECK-CVT-NEXT:    fcvt h5, s5
+; CHECK-CVT-NEXT:    fcvt s16, h16
+; CHECK-CVT-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-CVT-NEXT:    mov h4, v1.h[6]
+; CHECK-CVT-NEXT:    fadd s3, s6, s3
+; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h16
-; CHECK-CVT-NEXT:    mov h16, v0.h[6]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fadd s3, s4, s3
-; CHECK-CVT-NEXT:    fcvt h4, s5
+; CHECK-CVT-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcvt h3, s3
 ; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s6, h16
+; CHECK-CVT-NEXT:    fadd s6, s16, s7
+; CHECK-CVT-NEXT:    fcvt s1, h1
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-CVT-NEXT:    fcvt h3, s3
-; CHECK-CVT-NEXT:    fadd s4, s6, s5
+; CHECK-CVT-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-CVT-NEXT:    fadd s4, s5, s4
+; CHECK-CVT-NEXT:    fcvt h3, s6
 ; CHECK-CVT-NEXT:    fadd s0, s0, s1
 ; CHECK-CVT-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-CVT-NEXT:    fcvt h3, s4
@@ -81,46 +81,46 @@ define <8 x half> @sub_h(<8 x half> %a, <8 x half> %b) {
 ; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    mov h7, v0.h[2]
 ; CHECK-CVT-NEXT:    mov h16, v1.h[3]
-; CHECK-CVT-NEXT:    mov h17, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
 ; CHECK-CVT-NEXT:    fsub s4, s5, s4
-; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s6, h7
-; CHECK-CVT-NEXT:    fcvt s7, h16
-; CHECK-CVT-NEXT:    fcvt s16, h17
+; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    fcvt s6, h6
+; CHECK-CVT-NEXT:    fcvt s7, h7
+; CHECK-CVT-NEXT:    fcvt s16, h16
 ; CHECK-CVT-NEXT:    fsub s3, s3, s2
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    fcvt h2, s4
-; CHECK-CVT-NEXT:    fsub s4, s6, s5
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
-; CHECK-CVT-NEXT:    fsub s7, s16, s7
+; CHECK-CVT-NEXT:    fsub s4, s7, s6
+; CHECK-CVT-NEXT:    mov h6, v1.h[4]
+; CHECK-CVT-NEXT:    mov h7, v0.h[4]
 ; CHECK-CVT-NEXT:    fcvt h3, s3
+; CHECK-CVT-NEXT:    fsub s5, s5, s16
 ; CHECK-CVT-NEXT:    mov h16, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt h7, s7
+; CHECK-CVT-NEXT:    fcvt h4, s4
 ; CHECK-CVT-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-CVT-NEXT:    fcvt h3, s4
-; CHECK-CVT-NEXT:    mov h4, v1.h[5]
-; CHECK-CVT-NEXT:    fsub s5, s6, s5
-; CHECK-CVT-NEXT:    mov h6, v1.h[6]
-; CHECK-CVT-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-CVT-NEXT:    fcvt s3, h6
+; CHECK-CVT-NEXT:    fcvt s6, h7
+; CHECK-CVT-NEXT:    mov h7, v1.h[5]
+; CHECK-CVT-NEXT:    fcvt h5, s5
+; CHECK-CVT-NEXT:    fcvt s16, h16
+; CHECK-CVT-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-CVT-NEXT:    mov h4, v1.h[6]
+; CHECK-CVT-NEXT:    fsub s3, s6, s3
+; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h16
-; CHECK-CVT-NEXT:    mov h16, v0.h[6]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], v7.h[0]
-; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    fsub s3, s4, s3
-; CHECK-CVT-NEXT:    fcvt h4, s5
+; CHECK-CVT-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcvt h3, s3
 ; CHECK-CVT-NEXT:    fcvt s5, h6
-; CHECK-CVT-NEXT:    fcvt s6, h16
+; CHECK-CVT-NEXT:    fsub s6, s16, s7
+; CHECK-CVT-NEXT:    fcvt s1, h1
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-CVT-NEXT:    fcvt h3, s3
-; CHECK-CVT-NEXT:    fsub s4, s6, s5
+; CHECK-CVT-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-CVT-NEXT:    fsub s4, s5, s4
+; CHECK-CVT-NEXT:    fcvt h3, s6
 ; CHECK-CVT-NEXT:    fsub s0, s0, s1
 ; CHECK-CVT-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-CVT-NEXT:    fcvt h3, s4
@@ -149,47 +149,47 @@ define <8 x half> @mul_h(<8 x half> %a, <8 x half> %b) {
 ; CHECK-CVT-NEXT:    fcvt s5, h0
 ; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    mov h7, v0.h[2]
-; CHECK-CVT-NEXT:    mov h16, v0.h[3]
+; CHECK-CVT-NEXT:    mov h16, v1.h[3]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
 ; CHECK-CVT-NEXT:    fmul s4, s5, s4
-; CHECK-CVT-NEXT:    mov h5, v1.h[3]
+; CHECK-CVT-NEXT:    mov h5, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    fcvt s7, h7
+; CHECK-CVT-NEXT:    fcvt s16, h16
 ; CHECK-CVT-NEXT:    fmul s3, s3, s2
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    fcvt h2, s4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    fcvt s5, h16
-; CHECK-CVT-NEXT:    fmul s6, s7, s6
-; CHECK-CVT-NEXT:    mov h7, v1.h[4]
-; CHECK-CVT-NEXT:    mov h16, v0.h[4]
+; CHECK-CVT-NEXT:    fmul s4, s7, s6
+; CHECK-CVT-NEXT:    mov h6, v1.h[4]
+; CHECK-CVT-NEXT:    mov h7, v0.h[4]
 ; CHECK-CVT-NEXT:    fcvt h3, s3
-; CHECK-CVT-NEXT:    fmul s4, s5, s4
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
-; CHECK-CVT-NEXT:    fcvt h6, s6
-; CHECK-CVT-NEXT:    fcvt s7, h7
+; CHECK-CVT-NEXT:    fmul s5, s5, s16
+; CHECK-CVT-NEXT:    mov h16, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt h4, s4
 ; CHECK-CVT-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    fcvt s3, h6
+; CHECK-CVT-NEXT:    fcvt s6, h7
+; CHECK-CVT-NEXT:    mov h7, v1.h[5]
+; CHECK-CVT-NEXT:    fcvt h5, s5
 ; CHECK-CVT-NEXT:    fcvt s16, h16
-; CHECK-CVT-NEXT:    fcvt h4, s4
-; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov v2.h[2], v6.h[0]
-; CHECK-CVT-NEXT:    fmul s6, s16, s7
-; CHECK-CVT-NEXT:    mov h7, v1.h[6]
-; CHECK-CVT-NEXT:    mov h16, v0.h[6]
+; CHECK-CVT-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-CVT-NEXT:    mov h4, v1.h[6]
+; CHECK-CVT-NEXT:    fmul s3, s6, s3
+; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    fmul s3, s5, s3
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
-; CHECK-CVT-NEXT:    mov v2.h[3], v4.h[0]
-; CHECK-CVT-NEXT:    fcvt h4, s6
-; CHECK-CVT-NEXT:    fcvt s5, h7
-; CHECK-CVT-NEXT:    fcvt s6, h16
-; CHECK-CVT-NEXT:    fcvt s1, h1
+; CHECK-CVT-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-CVT-NEXT:    fcvt s4, h4
 ; CHECK-CVT-NEXT:    fcvt h3, s3
+; CHECK-CVT-NEXT:    fcvt s5, h6
+; CHECK-CVT-NEXT:    fmul s6, s16, s7
+; CHECK-CVT-NEXT:    fcvt s1, h1
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v2.h[4], v4.h[0]
-; CHECK-CVT-NEXT:    fmul s4, s6, s5
+; CHECK-CVT-NEXT:    mov v2.h[4], v3.h[0]
+; CHECK-CVT-NEXT:    fmul s4, s5, s4
+; CHECK-CVT-NEXT:    fcvt h3, s6
 ; CHECK-CVT-NEXT:    fmul s0, s0, s1
 ; CHECK-CVT-NEXT:    mov v2.h[5], v3.h[0]
 ; CHECK-CVT-NEXT:    fcvt h3, s4
@@ -220,17 +220,17 @@ define <8 x half> @div_h(<8 x half> %a, <8 x half> %b) {
 ; CHECK-CVT-NEXT:    mov h7, v0.h[4]
 ; CHECK-CVT-NEXT:    mov h16, v0.h[5]
 ; CHECK-CVT-NEXT:    mov h17, v0.h[6]
+; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
 ; CHECK-CVT-NEXT:    fcvt s5, h5
-; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcvt s16, h16
 ; CHECK-CVT-NEXT:    fcvt s17, h17
+; CHECK-CVT-NEXT:    fcvt s0, h0
 ; CHECK-CVT-NEXT:    fdiv s2, s3, s2
 ; CHECK-CVT-NEXT:    fcvt s3, h1
-; CHECK-CVT-NEXT:    fcvt s0, h0
 ; CHECK-CVT-NEXT:    fdiv s3, s4, s3
 ; CHECK-CVT-NEXT:    mov h4, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt h18, s2
@@ -312,22 +312,22 @@ define <8 x half> @s_to_h(<8 x float> %a) {
 define <8 x half> @d_to_h(<8 x double> %a) {
 ; CHECK-LABEL: d_to_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov d4, v0.d[1]
+; CHECK-NEXT:    mov d5, v0.d[1]
 ; CHECK-NEXT:    fcvt h0, d0
-; CHECK-NEXT:    mov d5, v1.d[1]
+; CHECK-NEXT:    fcvt h4, d1
+; CHECK-NEXT:    mov d1, v1.d[1]
+; CHECK-NEXT:    fcvt h5, d5
 ; CHECK-NEXT:    fcvt h1, d1
-; CHECK-NEXT:    fcvt h4, d4
-; CHECK-NEXT:    mov v0.h[1], v4.h[0]
-; CHECK-NEXT:    fcvt h4, d5
-; CHECK-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-NEXT:    mov v0.h[1], v5.h[0]
+; CHECK-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-NEXT:    mov v0.h[3], v1.h[0]
 ; CHECK-NEXT:    fcvt h1, d2
 ; CHECK-NEXT:    mov d2, v2.d[1]
-; CHECK-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-NEXT:    fcvt h2, d2
 ; CHECK-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NEXT:    fcvt h1, d3
-; CHECK-NEXT:    mov v0.h[5], v2.h[0]
+; CHECK-NEXT:    fcvt h1, d2
 ; CHECK-NEXT:    mov d2, v3.d[1]
+; CHECK-NEXT:    mov v0.h[5], v1.h[0]
+; CHECK-NEXT:    fcvt h1, d3
 ; CHECK-NEXT:    mov v0.h[6], v1.h[0]
 ; CHECK-NEXT:    fcvt h1, d2
 ; CHECK-NEXT:    mov v0.h[7], v1.h[0]
@@ -440,16 +440,16 @@ define <16 x half> @sitofp_v16i8(<16 x i8> %a) #0 {
 ; CHECK-CVT-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-CVT-NEXT:    sshll v2.4s, v1.4h, #0
 ; CHECK-CVT-NEXT:    sshll v3.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    sshll2 v4.4s, v1.8h, #0
+; CHECK-CVT-NEXT:    sshll2 v5.4s, v0.8h, #0
 ; CHECK-CVT-NEXT:    scvtf v2.4s, v2.4s
 ; CHECK-CVT-NEXT:    scvtf v3.4s, v3.4s
-; CHECK-CVT-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-CVT-NEXT:    sshll2 v4.4s, v0.8h, #0
-; CHECK-CVT-NEXT:    scvtf v5.4s, v1.4s
 ; CHECK-CVT-NEXT:    fcvtn v1.4h, v2.4s
-; CHECK-CVT-NEXT:    fcvtn v0.4h, v3.4s
 ; CHECK-CVT-NEXT:    scvtf v2.4s, v4.4s
-; CHECK-CVT-NEXT:    fcvtn2 v1.8h, v5.4s
-; CHECK-CVT-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-NEXT:    fcvtn v0.4h, v3.4s
+; CHECK-CVT-NEXT:    scvtf v3.4s, v5.4s
+; CHECK-CVT-NEXT:    fcvtn2 v1.8h, v2.4s
+; CHECK-CVT-NEXT:    fcvtn2 v0.8h, v3.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: sitofp_v16i8:
@@ -559,16 +559,16 @@ define <16 x half> @uitofp_v16i8(<16 x i8> %a) #0 {
 ; CHECK-CVT-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-CVT-NEXT:    ushll v2.4s, v1.4h, #0
 ; CHECK-CVT-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-CVT-NEXT:    ushll2 v4.4s, v1.8h, #0
+; CHECK-CVT-NEXT:    ushll2 v5.4s, v0.8h, #0
 ; CHECK-CVT-NEXT:    ucvtf v2.4s, v2.4s
 ; CHECK-CVT-NEXT:    ucvtf v3.4s, v3.4s
-; CHECK-CVT-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-CVT-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-CVT-NEXT:    ucvtf v5.4s, v1.4s
 ; CHECK-CVT-NEXT:    fcvtn v1.4h, v2.4s
-; CHECK-CVT-NEXT:    fcvtn v0.4h, v3.4s
 ; CHECK-CVT-NEXT:    ucvtf v2.4s, v4.4s
-; CHECK-CVT-NEXT:    fcvtn2 v1.8h, v5.4s
-; CHECK-CVT-NEXT:    fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-NEXT:    fcvtn v0.4h, v3.4s
+; CHECK-CVT-NEXT:    ucvtf v3.4s, v5.4s
+; CHECK-CVT-NEXT:    fcvtn2 v1.8h, v2.4s
+; CHECK-CVT-NEXT:    fcvtn2 v0.8h, v3.4s
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: uitofp_v16i8:
@@ -728,39 +728,39 @@ define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, ne
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, ne
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, ne
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, ne
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, ne
@@ -797,66 +797,66 @@ define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h2, v1.h[1]
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h0
+; CHECK-CVT-NEXT:    mov h5, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    fcvt s3, h5
+; CHECK-CVT-NEXT:    mov h5, v0.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, eq
 ; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    fcmp s6, s4
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h6, v1.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, eq
 ; CHECK-CVT-NEXT:    csinv w9, w9, wzr, vc
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    fcvt s2, h5
-; CHECK-CVT-NEXT:    fmov s3, w9
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    csetm w9, eq
-; CHECK-CVT-NEXT:    mov v3.h[1], w8
-; CHECK-CVT-NEXT:    csinv w8, w9, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s2, s4
-; CHECK-CVT-NEXT:    fcvt s2, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    mov h5, v1.h[5]
-; CHECK-CVT-NEXT:    mov h6, v0.h[5]
-; CHECK-CVT-NEXT:    csetm w9, eq
-; CHECK-CVT-NEXT:    mov v3.h[2], w8
-; CHECK-CVT-NEXT:    csinv w8, w9, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s4, s2
-; CHECK-CVT-NEXT:    fcvt s2, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    fcmp s2, s3
+; CHECK-CVT-NEXT:    mov h2, v0.h[4]
+; CHECK-CVT-NEXT:    fcvt s3, h4
+; CHECK-CVT-NEXT:    fcvt s4, h5
+; CHECK-CVT-NEXT:    fmov s5, w9
+; CHECK-CVT-NEXT:    fcvt s6, h6
+; CHECK-CVT-NEXT:    mov v5.h[1], w8
+; CHECK-CVT-NEXT:    csetm w8, eq
+; CHECK-CVT-NEXT:    fcvt s2, h2
+; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-CVT-NEXT:    fcmp s4, s3
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    mov v5.h[2], w8
+; CHECK-CVT-NEXT:    csetm w8, eq
+; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-CVT-NEXT:    fcmp s2, s6
+; CHECK-CVT-NEXT:    fcvt s2, h3
+; CHECK-CVT-NEXT:    fcvt s3, h4
+; CHECK-CVT-NEXT:    mov h4, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    csetm w9, eq
 ; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov v3.h[3], w8
-; CHECK-CVT-NEXT:    csinv w8, w9, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s4, s2
-; CHECK-CVT-NEXT:    fcvt s2, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
+; CHECK-CVT-NEXT:    mov v5.h[3], w8
+; CHECK-CVT-NEXT:    csetm w8, eq
+; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcvt s2, h4
+; CHECK-CVT-NEXT:    fcvt s3, h6
 ; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    csetm w9, eq
-; CHECK-CVT-NEXT:    mov v3.h[4], w8
-; CHECK-CVT-NEXT:    csinv w8, w9, wzr, vc
-; CHECK-CVT-NEXT:    fcmp s4, s2
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v3.h[5], w8
+; CHECK-CVT-NEXT:    mov v5.h[4], w8
+; CHECK-CVT-NEXT:    csetm w8, eq
+; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
+; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    mov v5.h[5], w8
 ; CHECK-CVT-NEXT:    csetm w8, eq
 ; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
 ; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v3.h[6], w8
+; CHECK-CVT-NEXT:    mov v5.h[6], w8
 ; CHECK-CVT-NEXT:    csetm w8, eq
 ; CHECK-CVT-NEXT:    csinv w8, w8, wzr, vc
-; CHECK-CVT-NEXT:    mov v3.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v3.8h
+; CHECK-CVT-NEXT:    mov v5.h[7], w8
+; CHECK-CVT-NEXT:    xtn v0.8b, v5.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_ueq:
@@ -878,39 +878,39 @@ define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, hi
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, hi
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, hi
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, hi
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, hi
@@ -948,39 +948,39 @@ define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, pl
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, pl
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, pl
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, pl
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, pl
@@ -1018,39 +1018,39 @@ define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, lt
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, lt
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, lt
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, lt
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, lt
@@ -1088,39 +1088,39 @@ define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, le
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, le
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, le
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, le
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, le
@@ -1158,39 +1158,39 @@ define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, vs
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, vs
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, vs
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, vs
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, vs
@@ -1229,66 +1229,66 @@ define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h2, v1.h[1]
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
-; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v0.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h0
+; CHECK-CVT-NEXT:    mov h5, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    fcvt s3, h5
+; CHECK-CVT-NEXT:    mov h5, v0.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, mi
 ; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT:    fcmp s5, s4
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    fcmp s6, s4
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h6, v1.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, mi
 ; CHECK-CVT-NEXT:    csinv w9, w9, wzr, le
-; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    fcvt s2, h5
-; CHECK-CVT-NEXT:    fmov s3, w9
-; CHECK-CVT-NEXT:    mov h5, v1.h[4]
-; CHECK-CVT-NEXT:    csetm w9, mi
-; CHECK-CVT-NEXT:    mov v3.h[1], w8
-; CHECK-CVT-NEXT:    csinv w8, w9, wzr, le
-; CHECK-CVT-NEXT:    fcmp s2, s4
-; CHECK-CVT-NEXT:    fcvt s2, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    mov h5, v1.h[5]
-; CHECK-CVT-NEXT:    mov h6, v0.h[5]
-; CHECK-CVT-NEXT:    csetm w9, mi
-; CHECK-CVT-NEXT:    mov v3.h[2], w8
-; CHECK-CVT-NEXT:    csinv w8, w9, wzr, le
-; CHECK-CVT-NEXT:    fcmp s4, s2
-; CHECK-CVT-NEXT:    fcvt s2, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    fcmp s2, s3
+; CHECK-CVT-NEXT:    mov h2, v0.h[4]
+; CHECK-CVT-NEXT:    fcvt s3, h4
+; CHECK-CVT-NEXT:    fcvt s4, h5
+; CHECK-CVT-NEXT:    fmov s5, w9
+; CHECK-CVT-NEXT:    fcvt s6, h6
+; CHECK-CVT-NEXT:    mov v5.h[1], w8
+; CHECK-CVT-NEXT:    csetm w8, mi
+; CHECK-CVT-NEXT:    fcvt s2, h2
+; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
+; CHECK-CVT-NEXT:    fcmp s4, s3
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    mov v5.h[2], w8
+; CHECK-CVT-NEXT:    csetm w8, mi
+; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
+; CHECK-CVT-NEXT:    fcmp s2, s6
+; CHECK-CVT-NEXT:    fcvt s2, h3
+; CHECK-CVT-NEXT:    fcvt s3, h4
+; CHECK-CVT-NEXT:    mov h4, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
-; CHECK-CVT-NEXT:    csetm w9, mi
 ; CHECK-CVT-NEXT:    mov h1, v1.h[7]
-; CHECK-CVT-NEXT:    mov v3.h[3], w8
-; CHECK-CVT-NEXT:    csinv w8, w9, wzr, le
-; CHECK-CVT-NEXT:    fcmp s4, s2
-; CHECK-CVT-NEXT:    fcvt s2, h5
-; CHECK-CVT-NEXT:    fcvt s4, h6
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
+; CHECK-CVT-NEXT:    mov v5.h[3], w8
+; CHECK-CVT-NEXT:    csetm w8, mi
+; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
+; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcvt s2, h4
+; CHECK-CVT-NEXT:    fcvt s3, h6
 ; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    csetm w9, mi
-; CHECK-CVT-NEXT:    mov v3.h[4], w8
-; CHECK-CVT-NEXT:    csinv w8, w9, wzr, le
-; CHECK-CVT-NEXT:    fcmp s4, s2
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov v3.h[5], w8
+; CHECK-CVT-NEXT:    mov v5.h[4], w8
+; CHECK-CVT-NEXT:    csetm w8, mi
+; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
+; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    mov v5.h[5], w8
 ; CHECK-CVT-NEXT:    csetm w8, mi
 ; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
 ; CHECK-CVT-NEXT:    fcmp s0, s1
-; CHECK-CVT-NEXT:    mov v3.h[6], w8
+; CHECK-CVT-NEXT:    mov v5.h[6], w8
 ; CHECK-CVT-NEXT:    csetm w8, mi
 ; CHECK-CVT-NEXT:    csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT:    mov v3.h[7], w8
-; CHECK-CVT-NEXT:    xtn v0.8b, v3.8h
+; CHECK-CVT-NEXT:    mov v5.h[7], w8
+; CHECK-CVT-NEXT:    xtn v0.8b, v5.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fcmp_one:
@@ -1309,39 +1309,39 @@ define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, eq
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, eq
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, eq
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, eq
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, eq
@@ -1378,39 +1378,39 @@ define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, gt
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, gt
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, gt
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, gt
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, gt
@@ -1447,39 +1447,39 @@ define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, ge
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, ge
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, ge
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, ge
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, ge
@@ -1516,39 +1516,39 @@ define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, mi
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, mi
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, mi
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, mi
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, mi
@@ -1585,39 +1585,39 @@ define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, ls
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, ls
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, ls
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, ls
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, ls
@@ -1654,39 +1654,39 @@ define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-CVT-NEXT:    mov h3, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s4, h1
 ; CHECK-CVT-NEXT:    fcvt s5, h0
-; CHECK-CVT-NEXT:    mov h6, v1.h[4]
-; CHECK-CVT-NEXT:    mov h7, v0.h[4]
-; CHECK-CVT-NEXT:    mov h16, v1.h[5]
+; CHECK-CVT-NEXT:    mov h6, v1.h[2]
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s7, h7
 ; CHECK-CVT-NEXT:    fcmp s3, s2
-; CHECK-CVT-NEXT:    mov h2, v1.h[2]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
+; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v1.h[3]
 ; CHECK-CVT-NEXT:    csetm w8, vc
 ; CHECK-CVT-NEXT:    fcmp s5, s4
+; CHECK-CVT-NEXT:    fcvt s5, h6
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    mov h4, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[4]
 ; CHECK-CVT-NEXT:    csetm w9, vc
-; CHECK-CVT-NEXT:    fcmp s3, s2
+; CHECK-CVT-NEXT:    fcmp s2, s5
 ; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    mov h5, v1.h[4]
+; CHECK-CVT-NEXT:    fcvt s6, h6
 ; CHECK-CVT-NEXT:    mov v2.h[1], w8
 ; CHECK-CVT-NEXT:    csetm w8, vc
 ; CHECK-CVT-NEXT:    fcmp s4, s3
-; CHECK-CVT-NEXT:    fcvt s3, h16
-; CHECK-CVT-NEXT:    fcvt s4, h5
-; CHECK-CVT-NEXT:    mov h5, v1.h[6]
+; CHECK-CVT-NEXT:    mov h3, v1.h[5]
+; CHECK-CVT-NEXT:    mov h4, v0.h[5]
+; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    mov v2.h[2], w8
-; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    csetm w8, vc
-; CHECK-CVT-NEXT:    fcmp s7, s6
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcmp s6, s5
+; CHECK-CVT-NEXT:    mov h5, v1.h[6]
 ; CHECK-CVT-NEXT:    mov h6, v0.h[6]
+; CHECK-CVT-NEXT:    mov h1, v1.h[7]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[7]
 ; CHECK-CVT-NEXT:    mov v2.h[3], w8
 ; CHECK-CVT-NEXT:    csetm w8, vc

diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
index c81eb610609a6f..301d28fd7be56b 100644
--- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
@@ -9,9 +9,9 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    fcvtzs w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzs w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
@@ -29,9 +29,9 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    fcvtzu w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzu w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
@@ -47,9 +47,9 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    fcvtzu w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzu w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
@@ -194,10 +194,10 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    smax v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
@@ -372,9 +372,9 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    csel x8, x0, xzr, eq
 ; CHECK-NEXT:    cmp x20, #0
 ; CHECK-NEXT:    csel x9, x19, xzr, eq
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
@@ -412,13 +412,13 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT:    csel x10, x19, xzr, lt
 ; CHECK-NEXT:    csinc x11, x20, xzr, lt
 ; CHECK-NEXT:    cmp xzr, x10
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ngcs xzr, x11
 ; CHECK-NEXT:    csel x10, x10, xzr, lt
 ; CHECK-NEXT:    cmp xzr, x8
 ; CHECK-NEXT:    ngcs xzr, x9
-; CHECK-NEXT:    csel x8, x8, xzr, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x10
+; CHECK-NEXT:    csel x8, x8, xzr, lt
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    add sp, sp, #48
@@ -439,9 +439,9 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov s1, v0.s[1]
 ; CHECK-NEXT:    fcvtzs x8, s0
+; CHECK-NEXT:    fcvtzs x9, s1
 ; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fcvtzs x8, s1
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    mov v0.d[1], x9
 ; CHECK-NEXT:    ret
 entry:
   %conv = fptosi <2 x float> %x to <2 x i128>
@@ -477,9 +477,9 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
 ; CHECK-NEXT:    csel x8, x0, xzr, eq
 ; CHECK-NEXT:    cmp x20, #0
 ; CHECK-NEXT:    csel x9, x19, xzr, eq
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
@@ -518,13 +518,13 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
 ; CHECK-NEXT:    csel x10, x19, xzr, lt
 ; CHECK-NEXT:    csinc x11, x20, xzr, lt
 ; CHECK-NEXT:    cmp xzr, x10
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ngcs xzr, x11
 ; CHECK-NEXT:    csel x10, x10, xzr, lt
 ; CHECK-NEXT:    cmp xzr, x9
 ; CHECK-NEXT:    ngcs xzr, x8
-; CHECK-NEXT:    csel x8, x9, xzr, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x10
+; CHECK-NEXT:    csel x8, x9, xzr, lt
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    add sp, sp, #48
@@ -547,9 +547,9 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-CVT-NEXT:    fcvt s0, h0
 ; CHECK-CVT-NEXT:    fcvt s1, h1
 ; CHECK-CVT-NEXT:    fcvtzs x8, s0
+; CHECK-CVT-NEXT:    fcvtzs x9, s1
 ; CHECK-CVT-NEXT:    fmov d0, x8
-; CHECK-CVT-NEXT:    fcvtzs x8, s1
-; CHECK-CVT-NEXT:    mov v0.d[1], x8
+; CHECK-CVT-NEXT:    mov v0.d[1], x9
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: stest_f16i64:
@@ -557,9 +557,9 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) {
 ; CHECK-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-FP16-NEXT:    fcvtzs x8, h0
+; CHECK-FP16-NEXT:    fcvtzs x9, h1
 ; CHECK-FP16-NEXT:    fmov d0, x8
-; CHECK-FP16-NEXT:    fcvtzs x8, h1
-; CHECK-FP16-NEXT:    mov v0.d[1], x8
+; CHECK-FP16-NEXT:    mov v0.d[1], x9
 ; CHECK-FP16-NEXT:    ret
 entry:
   %conv = fptosi <2 x half> %x to <2 x i128>
@@ -595,9 +595,9 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-NEXT:    csel x8, x0, xzr, eq
 ; CHECK-NEXT:    cmp x20, #0
 ; CHECK-NEXT:    csel x9, x19, xzr, eq
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
@@ -636,13 +636,13 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
 ; CHECK-NEXT:    csel x10, x19, xzr, lt
 ; CHECK-NEXT:    csinc x11, x20, xzr, lt
 ; CHECK-NEXT:    cmp xzr, x10
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ngcs xzr, x11
 ; CHECK-NEXT:    csel x10, x10, xzr, lt
 ; CHECK-NEXT:    cmp xzr, x9
 ; CHECK-NEXT:    ngcs xzr, x8
-; CHECK-NEXT:    csel x8, x9, xzr, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x10
+; CHECK-NEXT:    csel x8, x9, xzr, lt
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    add sp, sp, #48
@@ -666,9 +666,9 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    fcvtzs w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzs w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
@@ -684,9 +684,9 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    fcvtzu w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzu w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
@@ -701,9 +701,9 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    fcvtzu w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzu w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
@@ -833,10 +833,10 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
 ; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
-; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    smax v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
@@ -997,9 +997,9 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT:    csel x8, x0, xzr, eq
 ; CHECK-NEXT:    cmp x20, #0
 ; CHECK-NEXT:    csel x9, x19, xzr, eq
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
@@ -1036,10 +1036,10 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT:    csinc x10, x20, xzr, lt
 ; CHECK-NEXT:    csel x11, x19, xzr, lt
 ; CHECK-NEXT:    cmp x10, #0
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x10, xzr, x11, lt
 ; CHECK-NEXT:    cmp x9, #0
 ; CHECK-NEXT:    csel x8, xzr, x8, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x10
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
@@ -1059,9 +1059,9 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov s1, v0.s[1]
 ; CHECK-NEXT:    fcvtzs x8, s0
+; CHECK-NEXT:    fcvtzs x9, s1
 ; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fcvtzs x8, s1
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    mov v0.d[1], x9
 ; CHECK-NEXT:    ret
 entry:
   %conv = fptosi <2 x float> %x to <2 x i128>
@@ -1095,9 +1095,9 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NEXT:    csel x8, x0, xzr, eq
 ; CHECK-NEXT:    cmp x20, #0
 ; CHECK-NEXT:    csel x9, x19, xzr, eq
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
@@ -1135,10 +1135,10 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NEXT:    csinc x10, x20, xzr, lt
 ; CHECK-NEXT:    csel x11, x19, xzr, lt
 ; CHECK-NEXT:    cmp x10, #0
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x10, xzr, x11, lt
 ; CHECK-NEXT:    cmp x9, #0
 ; CHECK-NEXT:    csel x8, xzr, x8, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x10
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
@@ -1160,9 +1160,9 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-CVT-NEXT:    fcvt s0, h0
 ; CHECK-CVT-NEXT:    fcvt s1, h1
 ; CHECK-CVT-NEXT:    fcvtzs x8, s0
+; CHECK-CVT-NEXT:    fcvtzs x9, s1
 ; CHECK-CVT-NEXT:    fmov d0, x8
-; CHECK-CVT-NEXT:    fcvtzs x8, s1
-; CHECK-CVT-NEXT:    mov v0.d[1], x8
+; CHECK-CVT-NEXT:    mov v0.d[1], x9
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: stest_f16i64_mm:
@@ -1170,9 +1170,9 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) {
 ; CHECK-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-FP16-NEXT:    fcvtzs x8, h0
+; CHECK-FP16-NEXT:    fcvtzs x9, h1
 ; CHECK-FP16-NEXT:    fmov d0, x8
-; CHECK-FP16-NEXT:    fcvtzs x8, h1
-; CHECK-FP16-NEXT:    mov v0.d[1], x8
+; CHECK-FP16-NEXT:    mov v0.d[1], x9
 ; CHECK-FP16-NEXT:    ret
 entry:
   %conv = fptosi <2 x half> %x to <2 x i128>
@@ -1206,9 +1206,9 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) {
 ; CHECK-NEXT:    csel x8, x0, xzr, eq
 ; CHECK-NEXT:    cmp x20, #0
 ; CHECK-NEXT:    csel x9, x19, xzr, eq
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    ret
@@ -1246,10 +1246,10 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NEXT:    csinc x10, x20, xzr, lt
 ; CHECK-NEXT:    csel x11, x19, xzr, lt
 ; CHECK-NEXT:    cmp x10, #0
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x10, xzr, x11, lt
 ; CHECK-NEXT:    cmp x9, #0
 ; CHECK-NEXT:    csel x8, xzr, x8, lt
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x10
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]

diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 147333b7d864a6..a36a58660cd40e 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -165,9 +165,9 @@ define <2 x i32> @test_signed_v2f64_v2i32(<2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    fcvtzs w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzs w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
     %x = call <2 x i32> @llvm.fptosi.sat.v2f64.v2i32(<2 x double> %f)
@@ -178,10 +178,10 @@ define <3 x i32> @test_signed_v3f64_v3i32(<3 x double> %f) {
 ; CHECK-LABEL: test_signed_v3f64_v3i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    fcvtzs w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzs w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
 ; CHECK-NEXT:    fcvtzs w8, d2
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    mov v0.s[2], w8
 ; CHECK-NEXT:    fcvtzs w8, d0
 ; CHECK-NEXT:    mov v0.s[3], w8
@@ -195,11 +195,11 @@ define <4 x i32> @test_signed_v4f64_v4i32(<4 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d2, v0.d[1]
 ; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    fcvtzs w9, d2
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzs w8, d2
-; CHECK-NEXT:    mov v0.s[1], w8
 ; CHECK-NEXT:    fcvtzs w8, d1
 ; CHECK-NEXT:    mov d1, v1.d[1]
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    mov v0.s[2], w8
 ; CHECK-NEXT:    fcvtzs w8, d1
 ; CHECK-NEXT:    mov v0.s[3], w8
@@ -261,9 +261,9 @@ define <1 x i32> @test_signed_v1f128_v1i32(<1 x fp128> %f) {
 ; CHECK-NEXT:    bl __fixtfsi
 ; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel w19, w8, w0, lt
 ; CHECK-NEXT:    adrp x8, .LCPI14_1
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI14_1]
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -295,11 +295,11 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
 ; CHECK-NEXT:    .cfi_offset w21, -24
 ; CHECK-NEXT:    .cfi_offset w22, -32
 ; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    adrp x8, .LCPI15_0
-; CHECK-NEXT:    stp q1, q0, [sp, #32] // 32-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    stp q1, q0, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -308,9 +308,9 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
 ; CHECK-NEXT:    adrp x8, .LCPI15_1
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    cmp w19, #0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_1]
 ; CHECK-NEXT:    mov w20, #-2147483648 // =0x80000000
 ; CHECK-NEXT:    csel w19, w20, w0, lt
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_1]
 ; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -320,16 +320,16 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    bl __unordtf2
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    csel w22, wzr, w19, ne
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixtfsi
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, w20, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -364,12 +364,12 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-NEXT:    .cfi_offset w21, -24
 ; CHECK-NEXT:    .cfi_offset w22, -32
 ; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    stp q0, q2, [sp, #48] // 32-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v1.16b
+; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -378,9 +378,9 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-NEXT:    adrp x8, .LCPI16_1
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    cmp w19, #0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_1]
 ; CHECK-NEXT:    mov w20, #-2147483648 // =0x80000000
 ; CHECK-NEXT:    csel w19, w20, w0, lt
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_1]
 ; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -390,16 +390,16 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    bl __unordtf2
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    csel w22, wzr, w19, ne
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixtfsi
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, w20, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -419,8 +419,8 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixtfsi
 ; CHECK-NEXT:    ldr q0, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, w20, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #64] // 16-byte Folded Reload
@@ -430,8 +430,8 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-NEXT:    bl __unordtf2
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csel w8, wzr, w19, ne
 ; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    csel w8, wzr, w19, ne
 ; CHECK-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.s[2], w8
@@ -454,11 +454,11 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-NEXT:    .cfi_offset w21, -24
 ; CHECK-NEXT:    .cfi_offset w22, -32
 ; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    stp q2, q3, [sp, #64] // 32-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v1.16b
-; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
@@ -469,9 +469,9 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-NEXT:    adrp x8, .LCPI17_1
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    cmp w19, #0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_1]
 ; CHECK-NEXT:    mov w20, #-2147483648 // =0x80000000
 ; CHECK-NEXT:    csel w19, w20, w0, lt
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_1]
 ; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -488,8 +488,8 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixtfsi
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, w20, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -509,8 +509,8 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixtfsi
 ; CHECK-NEXT:    ldr q0, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, w20, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #64] // 16-byte Folded Reload
@@ -529,8 +529,8 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixtfsi
 ; CHECK-NEXT:    ldr q0, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, w20, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #80] // 16-byte Folded Reload
@@ -540,8 +540,8 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-NEXT:    bl __unordtf2
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csel w8, wzr, w19, ne
 ; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT:    csel w8, wzr, w19, ne
 ; CHECK-NEXT:    ldp x20, x19, [sp, #128] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #112] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.s[3], w8
@@ -638,10 +638,10 @@ define <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    mov w1, v1.s[1]
 ; CHECK-NEXT:    mov w2, v1.s[2]
-; CHECK-NEXT:    mov w3, v1.s[3]
 ; CHECK-NEXT:    mov w5, v0.s[1]
-; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    mov w3, v1.s[3]
 ; CHECK-NEXT:    fmov w4, s0
+; CHECK-NEXT:    fmov w0, s1
 ; CHECK-NEXT:    ret
     %x = call <6 x i32> @llvm.fptosi.sat.v6f16.v6i32(<6 x half> %f)
     ret <6 x i32> %x
@@ -697,9 +697,9 @@ define <2 x i1> @test_signed_v2f32_v2i1(<2 x float> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    fcvtzs v0.2s, v0.2s
-; CHECK-NEXT:    movi v2.2d, #0xffffffffffffffff
 ; CHECK-NEXT:    smin v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    smax v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
+; CHECK-NEXT:    smax v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ret
     %x = call <2 x i1> @llvm.fptosi.sat.v2f32.v2i1(<2 x float> %f)
     ret <2 x i1> %x
@@ -796,9 +796,9 @@ define <2 x i64> @test_signed_v2f32_v2i64(<2 x float> %f) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov s1, v0.s[1]
 ; CHECK-NEXT:    fcvtzs x8, s0
+; CHECK-NEXT:    fcvtzs x9, s1
 ; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fcvtzs x8, s1
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    mov v0.d[1], x9
 ; CHECK-NEXT:    ret
     %x = call <2 x i64> @llvm.fptosi.sat.v2f32.v2i64(<2 x float> %f)
     ret <2 x i64> %x
@@ -830,9 +830,9 @@ define <2 x i100> @test_signed_v2f32_v2i100(<2 x float> %f) {
 ; CHECK-NEXT:    movi v9.2s, #241, lsl #24
 ; CHECK-NEXT:    mov w8, #1895825407 // =0x70ffffff
 ; CHECK-NEXT:    mov x21, #-34359738368 // =0xfffffff800000000
+; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    mov x22, #34359738367 // =0x7ffffffff
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
@@ -847,8 +847,8 @@ define <2 x i100> @test_signed_v2f32_v2i100(<2 x float> %f) {
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x2, x19
 ; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, x21, x1, lt
@@ -896,9 +896,9 @@ define <2 x i128> @test_signed_v2f32_v2i128(<2 x float> %f) {
 ; CHECK-NEXT:    movi v9.2s, #255, lsl #24
 ; CHECK-NEXT:    mov w8, #2130706431 // =0x7effffff
 ; CHECK-NEXT:    mov x21, #-9223372036854775808 // =0x8000000000000000
+; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    mov x22, #9223372036854775807 // =0x7fffffffffffffff
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
@@ -913,8 +913,8 @@ define <2 x i128> @test_signed_v2f32_v2i128(<2 x float> %f) {
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x2, x19
 ; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, x21, x1, lt
@@ -1028,30 +1028,30 @@ define <4 x i50> @test_signed_v4f32_v4i50(<4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i50:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    mov x9, #562949953421311 // =0x1ffffffffffff
-; CHECK-NEXT:    mov x10, #-562949953421312 // =0xfffe000000000000
+; CHECK-NEXT:    mov x8, #562949953421311 // =0x1ffffffffffff
+; CHECK-NEXT:    mov x11, #-562949953421312 // =0xfffe000000000000
 ; CHECK-NEXT:    fcvtzs x12, s0
 ; CHECK-NEXT:    mov s2, v1.s[1]
-; CHECK-NEXT:    fcvtzs x8, s1
+; CHECK-NEXT:    fcvtzs x9, s1
 ; CHECK-NEXT:    mov s1, v0.s[1]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    fcvtzs x11, s2
-; CHECK-NEXT:    csel x8, x8, x9, lt
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    csel x2, x8, x10, gt
-; CHECK-NEXT:    cmp x11, x9
-; CHECK-NEXT:    csel x8, x11, x9, lt
-; CHECK-NEXT:    fcvtzs x11, s1
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    csel x3, x8, x10, gt
-; CHECK-NEXT:    cmp x12, x9
-; CHECK-NEXT:    csel x8, x12, x9, lt
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    csel x0, x8, x10, gt
-; CHECK-NEXT:    cmp x11, x9
-; CHECK-NEXT:    csel x8, x11, x9, lt
-; CHECK-NEXT:    cmp x8, x10
-; CHECK-NEXT:    csel x1, x8, x10, gt
+; CHECK-NEXT:    fcvtzs x10, s2
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    csel x9, x9, x8, lt
+; CHECK-NEXT:    cmp x9, x11
+; CHECK-NEXT:    csel x2, x9, x11, gt
+; CHECK-NEXT:    cmp x10, x8
+; CHECK-NEXT:    csel x9, x10, x8, lt
+; CHECK-NEXT:    fcvtzs x10, s1
+; CHECK-NEXT:    cmp x9, x11
+; CHECK-NEXT:    csel x3, x9, x11, gt
+; CHECK-NEXT:    cmp x12, x8
+; CHECK-NEXT:    csel x9, x12, x8, lt
+; CHECK-NEXT:    cmp x9, x11
+; CHECK-NEXT:    csel x0, x9, x11, gt
+; CHECK-NEXT:    cmp x10, x8
+; CHECK-NEXT:    csel x8, x10, x8, lt
+; CHECK-NEXT:    cmp x8, x11
+; CHECK-NEXT:    csel x1, x8, x11, gt
 ; CHECK-NEXT:    ret
     %x = call <4 x i50> @llvm.fptosi.sat.v4f32.v4i50(<4 x float> %f)
     ret <4 x i50> %x
@@ -1065,12 +1065,12 @@ define <4 x i64> @test_signed_v4f32_v4i64(<4 x float> %f) {
 ; CHECK-NEXT:    fcvtzs x9, s0
 ; CHECK-NEXT:    mov s2, v1.s[1]
 ; CHECK-NEXT:    fcvtzs x8, s1
+; CHECK-NEXT:    fcvtzs x11, s3
 ; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    fcvtzs x9, s3
+; CHECK-NEXT:    fcvtzs x10, s2
 ; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    fcvtzs x8, s2
-; CHECK-NEXT:    mov v0.d[1], x9
-; CHECK-NEXT:    mov v1.d[1], x8
+; CHECK-NEXT:    mov v0.d[1], x11
+; CHECK-NEXT:    mov v1.d[1], x10
 ; CHECK-NEXT:    ret
     %x = call <4 x i64> @llvm.fptosi.sat.v4f32.v4i64(<4 x float> %f)
     ret <4 x i64> %x
@@ -1107,11 +1107,11 @@ define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    movi v9.2s, #241, lsl #24
 ; CHECK-NEXT:    mov w8, #1895825407 // =0x70ffffff
 ; CHECK-NEXT:    mov x25, #-34359738368 // =0xfffffff800000000
+; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x26, #34359738367 // =0x7ffffffff
-; CHECK-NEXT:    fmov s10, w8
-; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
@@ -1156,22 +1156,22 @@ define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    mov x6, x23
 ; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    mov x7, x24
+; CHECK-NEXT:    ldr x30, [sp, #56] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, x25, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s0, s10
-; CHECK-NEXT:    ldr x30, [sp, #56] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d10, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #40] // 16-byte Folded Reload
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
 ; CHECK-NEXT:    csel x8, x26, x8, gt
 ; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    ldr d10, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x9, xzr, x9, vs
 ; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    ldp d9, d8, [sp, #40] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #128
@@ -1211,11 +1211,11 @@ define <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    movi v9.2s, #255, lsl #24
 ; CHECK-NEXT:    mov w8, #2130706431 // =0x7effffff
 ; CHECK-NEXT:    mov x25, #-9223372036854775808 // =0x8000000000000000
+; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x26, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    fmov s10, w8
-; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
@@ -1260,22 +1260,22 @@ define <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    mov x6, x23
 ; CHECK-NEXT:    fcmp s0, s9
 ; CHECK-NEXT:    mov x7, x24
+; CHECK-NEXT:    ldr x30, [sp, #56] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, x25, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s0, s10
-; CHECK-NEXT:    ldr x30, [sp, #56] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d10, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #40] // 16-byte Folded Reload
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
 ; CHECK-NEXT:    csel x8, x26, x8, gt
 ; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    ldr d10, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x24, x23, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x9, xzr, x9, vs
 ; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    ldp x26, x25, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    ldp d9, d8, [sp, #40] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #128
@@ -1320,8 +1320,8 @@ define <2 x i8> @test_signed_v2f64_v2i8(<2 x double> %f) {
 ; CHECK-LABEL: test_signed_v2f64_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d1, v0.d[1]
-; CHECK-NEXT:    mov w8, #127 // =0x7f
 ; CHECK-NEXT:    fcvtzs w10, d0
+; CHECK-NEXT:    mov w8, #127 // =0x7f
 ; CHECK-NEXT:    mov w11, #-128 // =0xffffff80
 ; CHECK-NEXT:    fcvtzs w9, d1
 ; CHECK-NEXT:    cmp w9, #127
@@ -1344,8 +1344,8 @@ define <2 x i13> @test_signed_v2f64_v2i13(<2 x double> %f) {
 ; CHECK-LABEL: test_signed_v2f64_v2i13:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d1, v0.d[1]
-; CHECK-NEXT:    mov w8, #4095 // =0xfff
 ; CHECK-NEXT:    fcvtzs w10, d0
+; CHECK-NEXT:    mov w8, #4095 // =0xfff
 ; CHECK-NEXT:    mov w11, #-4096 // =0xfffff000
 ; CHECK-NEXT:    fcvtzs w9, d1
 ; CHECK-NEXT:    cmp w9, #4095
@@ -1417,9 +1417,9 @@ define <2 x i32> @test_signed_v2f64_v2i32_duplicate(<2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    fcvtzs w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzs w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
     %x = call <2 x i32> @llvm.fptosi.sat.v2f64.v2i32(<2 x double> %f)
@@ -1483,12 +1483,12 @@ define <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) {
 ; CHECK-NEXT:    mov x8, #-4170333254945079296 // =0xc620000000000000
 ; CHECK-NEXT:    mov x21, #-34359738368 // =0xfffffff800000000
 ; CHECK-NEXT:    mov x22, #34359738367 // =0x7ffffffff
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    fmov d9, x8
 ; CHECK-NEXT:    mov x8, #5053038781909696511 // =0x461fffffffffffff
-; CHECK-NEXT:    fcmp d8, d9
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d10, x8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    fcmp d8, d9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x21, x1, lt
 ; CHECK-NEXT:    fcmp d8, d10
@@ -1501,8 +1501,8 @@ define <2 x i100> @test_signed_v2f64_v2i100(<2 x double> %f) {
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x2, x19
 ; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT:    fcmp d0, d9
 ; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, x21, x1, lt
@@ -1549,12 +1549,12 @@ define <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) {
 ; CHECK-NEXT:    mov x8, #-4044232465378705408 // =0xc7e0000000000000
 ; CHECK-NEXT:    mov x21, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    mov x22, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    fmov d9, x8
 ; CHECK-NEXT:    mov x8, #5179139571476070399 // =0x47dfffffffffffff
-; CHECK-NEXT:    fcmp d8, d9
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d10, x8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    fcmp d8, d9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x21, x1, lt
 ; CHECK-NEXT:    fcmp d8, d10
@@ -1567,8 +1567,8 @@ define <2 x i128> @test_signed_v2f64_v2i128(<2 x double> %f) {
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x2, x19
 ; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT:    fcmp d0, d9
 ; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, x21, x1, lt
@@ -1620,9 +1620,9 @@ define <4 x i1> @test_signed_v4f16_v4i1(<4 x half> %f) {
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-FP16-NEXT:    fcvtzs v0.4h, v0.4h
-; CHECK-FP16-NEXT:    movi v2.2d, #0xffffffffffffffff
 ; CHECK-FP16-NEXT:    smin v0.4h, v0.4h, v1.4h
-; CHECK-FP16-NEXT:    smax v0.4h, v0.4h, v2.4h
+; CHECK-FP16-NEXT:    movi v1.2d, #0xffffffffffffffff
+; CHECK-FP16-NEXT:    smax v0.4h, v0.4h, v1.4h
 ; CHECK-FP16-NEXT:    ret
     %x = call <4 x i1> @llvm.fptosi.sat.v4f16.v4i1(<4 x half> %f)
     ret <4 x i1> %x
@@ -1667,9 +1667,9 @@ define <4 x i13> @test_signed_v4f16_v4i13(<4 x half> %f) {
 ; CHECK-FP16-LABEL: test_signed_v4f16_v4i13:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    fcvtzs v0.4h, v0.4h
+; CHECK-FP16-NEXT:    mvni v1.4h, #240, lsl #8
+; CHECK-FP16-NEXT:    smin v0.4h, v0.4h, v1.4h
 ; CHECK-FP16-NEXT:    movi v1.4h, #240, lsl #8
-; CHECK-FP16-NEXT:    mvni v2.4h, #240, lsl #8
-; CHECK-FP16-NEXT:    smin v0.4h, v0.4h, v2.4h
 ; CHECK-FP16-NEXT:    smax v0.4h, v0.4h, v1.4h
 ; CHECK-FP16-NEXT:    ret
     %x = call <4 x i13> @llvm.fptosi.sat.v4f16.v4i13(<4 x half> %f)
@@ -1722,27 +1722,27 @@ define <4 x i50> @test_signed_v4f16_v4i50(<4 x half> %f) {
 ; CHECK-CVT-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-CVT-NEXT:    mov h1, v0.h[1]
 ; CHECK-CVT-NEXT:    fcvt s2, h0
+; CHECK-CVT-NEXT:    mov x8, #562949953421311 // =0x1ffffffffffff
 ; CHECK-CVT-NEXT:    mov h3, v0.h[2]
 ; CHECK-CVT-NEXT:    mov h0, v0.h[3]
-; CHECK-CVT-NEXT:    mov x8, #562949953421311 // =0x1ffffffffffff
 ; CHECK-CVT-NEXT:    mov x11, #-562949953421312 // =0xfffe000000000000
 ; CHECK-CVT-NEXT:    fcvt s1, h1
 ; CHECK-CVT-NEXT:    fcvtzs x9, s2
+; CHECK-CVT-NEXT:    fcvt s2, h3
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    cmp x9, x8
 ; CHECK-CVT-NEXT:    fcvtzs x10, s1
-; CHECK-CVT-NEXT:    fcvt s1, h3
+; CHECK-CVT-NEXT:    cmp x9, x8
 ; CHECK-CVT-NEXT:    csel x9, x9, x8, lt
+; CHECK-CVT-NEXT:    fcvtzs x12, s2
 ; CHECK-CVT-NEXT:    cmp x9, x11
 ; CHECK-CVT-NEXT:    csel x0, x9, x11, gt
 ; CHECK-CVT-NEXT:    cmp x10, x8
-; CHECK-CVT-NEXT:    fcvtzs x9, s1
-; CHECK-CVT-NEXT:    csel x10, x10, x8, lt
-; CHECK-CVT-NEXT:    cmp x10, x11
-; CHECK-CVT-NEXT:    csel x1, x10, x11, gt
+; CHECK-CVT-NEXT:    csel x9, x10, x8, lt
 ; CHECK-CVT-NEXT:    fcvtzs x10, s0
-; CHECK-CVT-NEXT:    cmp x9, x8
-; CHECK-CVT-NEXT:    csel x9, x9, x8, lt
+; CHECK-CVT-NEXT:    cmp x9, x11
+; CHECK-CVT-NEXT:    csel x1, x9, x11, gt
+; CHECK-CVT-NEXT:    cmp x12, x8
+; CHECK-CVT-NEXT:    csel x9, x12, x8, lt
 ; CHECK-CVT-NEXT:    cmp x9, x11
 ; CHECK-CVT-NEXT:    csel x2, x9, x11, gt
 ; CHECK-CVT-NEXT:    cmp x10, x8
@@ -1757,22 +1757,22 @@ define <4 x i50> @test_signed_v4f16_v4i50(<4 x half> %f) {
 ; CHECK-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-FP16-NEXT:    fcvtzs x9, h0
 ; CHECK-FP16-NEXT:    mov x8, #562949953421311 // =0x1ffffffffffff
+; CHECK-FP16-NEXT:    mov h2, v0.h[2]
 ; CHECK-FP16-NEXT:    mov x11, #-562949953421312 // =0xfffe000000000000
-; CHECK-FP16-NEXT:    cmp x9, x8
+; CHECK-FP16-NEXT:    mov h0, v0.h[3]
 ; CHECK-FP16-NEXT:    fcvtzs x10, h1
-; CHECK-FP16-NEXT:    mov h1, v0.h[2]
+; CHECK-FP16-NEXT:    cmp x9, x8
 ; CHECK-FP16-NEXT:    csel x9, x9, x8, lt
-; CHECK-FP16-NEXT:    mov h0, v0.h[3]
+; CHECK-FP16-NEXT:    fcvtzs x12, h2
 ; CHECK-FP16-NEXT:    cmp x9, x11
 ; CHECK-FP16-NEXT:    csel x0, x9, x11, gt
 ; CHECK-FP16-NEXT:    cmp x10, x8
-; CHECK-FP16-NEXT:    fcvtzs x9, h1
-; CHECK-FP16-NEXT:    csel x10, x10, x8, lt
-; CHECK-FP16-NEXT:    cmp x10, x11
-; CHECK-FP16-NEXT:    csel x1, x10, x11, gt
+; CHECK-FP16-NEXT:    csel x9, x10, x8, lt
 ; CHECK-FP16-NEXT:    fcvtzs x10, h0
-; CHECK-FP16-NEXT:    cmp x9, x8
-; CHECK-FP16-NEXT:    csel x9, x9, x8, lt
+; CHECK-FP16-NEXT:    cmp x9, x11
+; CHECK-FP16-NEXT:    csel x1, x9, x11, gt
+; CHECK-FP16-NEXT:    cmp x12, x8
+; CHECK-FP16-NEXT:    csel x9, x12, x8, lt
 ; CHECK-FP16-NEXT:    cmp x9, x11
 ; CHECK-FP16-NEXT:    csel x2, x9, x11, gt
 ; CHECK-FP16-NEXT:    cmp x10, x8
@@ -1790,19 +1790,19 @@ define <4 x i64> @test_signed_v4f16_v4i64(<4 x half> %f) {
 ; CHECK-CVT-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-CVT-NEXT:    mov h1, v0.h[2]
 ; CHECK-CVT-NEXT:    mov h2, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s3, h0
-; CHECK-CVT-NEXT:    mov h0, v0.h[3]
+; CHECK-CVT-NEXT:    mov h3, v0.h[3]
+; CHECK-CVT-NEXT:    fcvt s0, h0
 ; CHECK-CVT-NEXT:    fcvt s1, h1
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvtzs x8, s3
-; CHECK-CVT-NEXT:    fcvt s3, h0
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvtzs x8, s0
 ; CHECK-CVT-NEXT:    fcvtzs x9, s1
+; CHECK-CVT-NEXT:    fcvtzs x10, s2
+; CHECK-CVT-NEXT:    fcvtzs x11, s3
 ; CHECK-CVT-NEXT:    fmov d0, x8
-; CHECK-CVT-NEXT:    fcvtzs x8, s2
 ; CHECK-CVT-NEXT:    fmov d1, x9
-; CHECK-CVT-NEXT:    fcvtzs x9, s3
-; CHECK-CVT-NEXT:    mov v0.d[1], x8
-; CHECK-CVT-NEXT:    mov v1.d[1], x9
+; CHECK-CVT-NEXT:    mov v0.d[1], x10
+; CHECK-CVT-NEXT:    mov v1.d[1], x11
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v4f16_v4i64:
@@ -1813,12 +1813,12 @@ define <4 x i64> @test_signed_v4f16_v4i64(<4 x half> %f) {
 ; CHECK-FP16-NEXT:    mov h3, v0.h[3]
 ; CHECK-FP16-NEXT:    fcvtzs x8, h0
 ; CHECK-FP16-NEXT:    fcvtzs x9, h1
+; CHECK-FP16-NEXT:    fcvtzs x10, h2
+; CHECK-FP16-NEXT:    fcvtzs x11, h3
 ; CHECK-FP16-NEXT:    fmov d0, x8
-; CHECK-FP16-NEXT:    fcvtzs x8, h2
 ; CHECK-FP16-NEXT:    fmov d1, x9
-; CHECK-FP16-NEXT:    fcvtzs x9, h3
-; CHECK-FP16-NEXT:    mov v0.d[1], x8
-; CHECK-FP16-NEXT:    mov v1.d[1], x9
+; CHECK-FP16-NEXT:    mov v0.d[1], x10
+; CHECK-FP16-NEXT:    mov v1.d[1], x11
 ; CHECK-FP16-NEXT:    ret
     %x = call <4 x i64> @llvm.fptosi.sat.v4f16.v4i64(<4 x half> %f)
     ret <4 x i64> %x
@@ -1857,11 +1857,11 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    movi v9.2s, #241, lsl #24
 ; CHECK-NEXT:    mov w8, #1895825407 // =0x70ffffff
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    mov x25, #-34359738368 // =0xfffffff800000000
 ; CHECK-NEXT:    mov x26, #34359738367 // =0x7ffffffff
-; CHECK-NEXT:    fmov s10, w8
-; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    mov h0, v0.h[2]
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
@@ -1873,8 +1873,8 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    csel x20, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
@@ -1906,24 +1906,24 @@ define <4 x i100> @test_signed_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    mov x4, x21
 ; CHECK-NEXT:    mov x5, x22
 ; CHECK-NEXT:    mov x6, x23
+; CHECK-NEXT:    mov x7, x24
+; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, x25, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s10
-; CHECK-NEXT:    mov x7, x24
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
 ; CHECK-NEXT:    csel x8, x26, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
-; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x9, xzr, x9, vs
 ; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    ret
@@ -1964,11 +1964,11 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    movi v9.2s, #255, lsl #24
 ; CHECK-NEXT:    mov w8, #2130706431 // =0x7effffff
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    fmov s10, w8
 ; CHECK-NEXT:    mov x25, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    mov x26, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    fmov s10, w8
-; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    mov h0, v0.h[2]
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
 ; CHECK-NEXT:    fcmp s8, s10
@@ -1980,8 +1980,8 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    csel x20, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, x25, x1, lt
@@ -2013,24 +2013,24 @@ define <4 x i128> @test_signed_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    mov x4, x21
 ; CHECK-NEXT:    mov x5, x22
 ; CHECK-NEXT:    mov x6, x23
+; CHECK-NEXT:    mov x7, x24
+; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, x25, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s10
-; CHECK-NEXT:    mov x7, x24
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
 ; CHECK-NEXT:    csel x8, x26, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
-; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x9, xzr, x9, vs
 ; CHECK-NEXT:    csel x1, xzr, x8, vs
-; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    ret
@@ -2058,44 +2058,44 @@ define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) {
 ; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    mov s3, v1.s[2]
-; CHECK-CVT-NEXT:    mov s4, v1.s[3]
-; CHECK-CVT-NEXT:    mov s5, v0.s[1]
 ; CHECK-CVT-NEXT:    fcvtzs w9, s1
-; CHECK-CVT-NEXT:    fcvtzs w10, s0
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
+; CHECK-CVT-NEXT:    fcvtzs w13, s0
 ; CHECK-CVT-NEXT:    fcvtzs w8, s2
-; CHECK-CVT-NEXT:    fcvtzs w11, s3
-; CHECK-CVT-NEXT:    fcvtzs w12, s4
-; CHECK-CVT-NEXT:    fcvtzs w13, s5
+; CHECK-CVT-NEXT:    mov s2, v1.s[2]
+; CHECK-CVT-NEXT:    mov s1, v1.s[3]
 ; CHECK-CVT-NEXT:    ands w8, w8, w8, asr #31
+; CHECK-CVT-NEXT:    fcvtzs w10, s2
+; CHECK-CVT-NEXT:    mov s2, v0.s[1]
+; CHECK-CVT-NEXT:    fcvtzs w11, s1
+; CHECK-CVT-NEXT:    mov s1, v0.s[2]
+; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    csinv w8, w8, wzr, ge
 ; CHECK-CVT-NEXT:    ands w9, w9, w9, asr #31
 ; CHECK-CVT-NEXT:    csinv w9, w9, wzr, ge
+; CHECK-CVT-NEXT:    ands w10, w10, w10, asr #31
+; CHECK-CVT-NEXT:    fcvtzs w12, s2
+; CHECK-CVT-NEXT:    fcvtzs w14, s1
+; CHECK-CVT-NEXT:    fmov s1, w9
+; CHECK-CVT-NEXT:    fcvtzs w9, s0
+; CHECK-CVT-NEXT:    csinv w10, w10, wzr, ge
 ; CHECK-CVT-NEXT:    ands w11, w11, w11, asr #31
 ; CHECK-CVT-NEXT:    csinv w11, w11, wzr, ge
 ; CHECK-CVT-NEXT:    ands w12, w12, w12, asr #31
+; CHECK-CVT-NEXT:    mov v1.s[1], w8
 ; CHECK-CVT-NEXT:    csinv w12, w12, wzr, ge
 ; CHECK-CVT-NEXT:    ands w13, w13, w13, asr #31
 ; CHECK-CVT-NEXT:    csinv w13, w13, wzr, ge
-; CHECK-CVT-NEXT:    ands w10, w10, w10, asr #31
-; CHECK-CVT-NEXT:    csinv w10, w10, wzr, ge
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvtzs w9, s1
-; CHECK-CVT-NEXT:    fmov s3, w10
-; CHECK-CVT-NEXT:    mov v2.s[1], w8
-; CHECK-CVT-NEXT:    ands w8, w9, w9, asr #31
+; CHECK-CVT-NEXT:    ands w8, w14, w14, asr #31
+; CHECK-CVT-NEXT:    mov v1.s[2], w10
+; CHECK-CVT-NEXT:    fmov s2, w13
 ; CHECK-CVT-NEXT:    csinv w8, w8, wzr, ge
-; CHECK-CVT-NEXT:    fcvtzs w9, s0
-; CHECK-CVT-NEXT:    mov v3.s[1], w13
-; CHECK-CVT-NEXT:    mov v2.s[2], w11
-; CHECK-CVT-NEXT:    mov v3.s[2], w8
+; CHECK-CVT-NEXT:    mov v2.s[1], w12
+; CHECK-CVT-NEXT:    mov v1.s[3], w11
+; CHECK-CVT-NEXT:    mov v2.s[2], w8
 ; CHECK-CVT-NEXT:    ands w8, w9, w9, asr #31
 ; CHECK-CVT-NEXT:    csinv w8, w8, wzr, ge
-; CHECK-CVT-NEXT:    mov v2.s[3], w12
-; CHECK-CVT-NEXT:    mov v3.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v3.8h, v2.8h
+; CHECK-CVT-NEXT:    mov v2.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -2103,9 +2103,9 @@ define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) {
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-FP16-NEXT:    fcvtzs v0.8h, v0.8h
-; CHECK-FP16-NEXT:    movi v2.2d, #0xffffffffffffffff
 ; CHECK-FP16-NEXT:    smin v0.8h, v0.8h, v1.8h
-; CHECK-FP16-NEXT:    smax v0.8h, v0.8h, v2.8h
+; CHECK-FP16-NEXT:    movi v1.2d, #0xffffffffffffffff
+; CHECK-FP16-NEXT:    smax v0.8h, v0.8h, v1.8h
 ; CHECK-FP16-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-FP16-NEXT:    ret
     %x = call <8 x i1> @llvm.fptosi.sat.v8f16.v8i1(<8 x half> %f)
@@ -2118,62 +2118,62 @@ define <8 x i8> @test_signed_v8f16_v8i8(<8 x half> %f) {
 ; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
 ; CHECK-CVT-NEXT:    mov w8, #127 // =0x7f
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w10, #-128 // =0xffffff80
+; CHECK-CVT-NEXT:    mov w11, #-128 // =0xffffff80
 ; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w11, s1
+; CHECK-CVT-NEXT:    fcvtzs w10, s1
 ; CHECK-CVT-NEXT:    fcvtzs w15, s0
 ; CHECK-CVT-NEXT:    fcvtzs w9, s2
 ; CHECK-CVT-NEXT:    mov s2, v1.s[2]
 ; CHECK-CVT-NEXT:    mov s1, v1.s[3]
 ; CHECK-CVT-NEXT:    cmp w9, #127
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w12, s2
-; CHECK-CVT-NEXT:    cmn w9, #128
 ; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    csel w9, w9, w10, gt
-; CHECK-CVT-NEXT:    cmp w11, #127
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lt
+; CHECK-CVT-NEXT:    csel w9, w9, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w13, s1
-; CHECK-CVT-NEXT:    cmn w11, #128
 ; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    csel w11, w11, w10, gt
+; CHECK-CVT-NEXT:    cmn w9, #128
+; CHECK-CVT-NEXT:    mov s0, v0.s[3]
+; CHECK-CVT-NEXT:    csel w9, w9, w11, gt
+; CHECK-CVT-NEXT:    cmp w10, #127
+; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
+; CHECK-CVT-NEXT:    fcvtzs w14, s2
+; CHECK-CVT-NEXT:    cmn w10, #128
+; CHECK-CVT-NEXT:    fcvtzs w16, s1
+; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
 ; CHECK-CVT-NEXT:    cmp w12, #127
 ; CHECK-CVT-NEXT:    csel w12, w12, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w14, s2
+; CHECK-CVT-NEXT:    fmov s1, w10
 ; CHECK-CVT-NEXT:    cmn w12, #128
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w12, w12, w10, gt
+; CHECK-CVT-NEXT:    csel w12, w12, w11, gt
 ; CHECK-CVT-NEXT:    cmp w13, #127
 ; CHECK-CVT-NEXT:    csel w13, w13, w8, lt
-; CHECK-CVT-NEXT:    fmov s2, w11
+; CHECK-CVT-NEXT:    mov v1.s[1], w9
+; CHECK-CVT-NEXT:    fcvtzs w9, s0
 ; CHECK-CVT-NEXT:    cmn w13, #128
-; CHECK-CVT-NEXT:    csel w13, w13, w10, gt
+; CHECK-CVT-NEXT:    csel w13, w13, w11, gt
 ; CHECK-CVT-NEXT:    cmp w14, #127
 ; CHECK-CVT-NEXT:    csel w14, w14, w8, lt
 ; CHECK-CVT-NEXT:    cmn w14, #128
-; CHECK-CVT-NEXT:    csel w14, w14, w10, gt
+; CHECK-CVT-NEXT:    mov v1.s[2], w12
+; CHECK-CVT-NEXT:    csel w14, w14, w11, gt
 ; CHECK-CVT-NEXT:    cmp w15, #127
 ; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
 ; CHECK-CVT-NEXT:    cmn w15, #128
-; CHECK-CVT-NEXT:    csel w11, w15, w10, gt
-; CHECK-CVT-NEXT:    fcvtzs w15, s1
-; CHECK-CVT-NEXT:    mov v2.s[1], w9
-; CHECK-CVT-NEXT:    fmov s1, w11
-; CHECK-CVT-NEXT:    cmp w15, #127
-; CHECK-CVT-NEXT:    csel w9, w15, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w11, s0
-; CHECK-CVT-NEXT:    cmn w9, #128
-; CHECK-CVT-NEXT:    mov v1.s[1], w14
-; CHECK-CVT-NEXT:    csel w9, w9, w10, gt
-; CHECK-CVT-NEXT:    cmp w11, #127
-; CHECK-CVT-NEXT:    csel w8, w11, w8, lt
-; CHECK-CVT-NEXT:    mov v2.s[2], w12
+; CHECK-CVT-NEXT:    csel w10, w15, w11, gt
+; CHECK-CVT-NEXT:    cmp w16, #127
+; CHECK-CVT-NEXT:    mov v1.s[3], w13
+; CHECK-CVT-NEXT:    fmov s2, w10
+; CHECK-CVT-NEXT:    csel w10, w16, w8, lt
+; CHECK-CVT-NEXT:    cmn w10, #128
+; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
+; CHECK-CVT-NEXT:    cmp w9, #127
+; CHECK-CVT-NEXT:    mov v2.s[1], w14
+; CHECK-CVT-NEXT:    csel w8, w9, w8, lt
 ; CHECK-CVT-NEXT:    cmn w8, #128
-; CHECK-CVT-NEXT:    mov v1.s[2], w9
-; CHECK-CVT-NEXT:    csel w8, w8, w10, gt
-; CHECK-CVT-NEXT:    mov v2.s[3], w13
-; CHECK-CVT-NEXT:    mov v1.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
+; CHECK-CVT-NEXT:    csel w8, w8, w11, gt
+; CHECK-CVT-NEXT:    mov v2.s[2], w10
+; CHECK-CVT-NEXT:    mov v2.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -2192,70 +2192,70 @@ define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) {
 ; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
 ; CHECK-CVT-NEXT:    mov w8, #4095 // =0xfff
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w10, #-4096 // =0xfffff000
+; CHECK-CVT-NEXT:    mov w11, #-4096 // =0xfffff000
 ; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w11, s1
+; CHECK-CVT-NEXT:    fcvtzs w10, s1
 ; CHECK-CVT-NEXT:    fcvtzs w15, s0
 ; CHECK-CVT-NEXT:    fcvtzs w9, s2
 ; CHECK-CVT-NEXT:    mov s2, v1.s[2]
 ; CHECK-CVT-NEXT:    mov s1, v1.s[3]
 ; CHECK-CVT-NEXT:    cmp w9, #4095
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w12, s2
-; CHECK-CVT-NEXT:    cmn w9, #1, lsl #12 // =4096
 ; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    csel w9, w9, w10, gt
-; CHECK-CVT-NEXT:    cmp w11, #4095
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lt
+; CHECK-CVT-NEXT:    csel w9, w9, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w13, s1
-; CHECK-CVT-NEXT:    cmn w11, #1, lsl #12 // =4096
 ; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    csel w11, w11, w10, gt
+; CHECK-CVT-NEXT:    cmn w9, #1, lsl #12 // =4096
+; CHECK-CVT-NEXT:    mov s0, v0.s[3]
+; CHECK-CVT-NEXT:    csel w9, w9, w11, gt
+; CHECK-CVT-NEXT:    cmp w10, #4095
+; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
+; CHECK-CVT-NEXT:    fcvtzs w14, s2
+; CHECK-CVT-NEXT:    cmn w10, #1, lsl #12 // =4096
+; CHECK-CVT-NEXT:    fcvtzs w16, s1
+; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
 ; CHECK-CVT-NEXT:    cmp w12, #4095
 ; CHECK-CVT-NEXT:    csel w12, w12, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w14, s2
+; CHECK-CVT-NEXT:    fmov s1, w10
 ; CHECK-CVT-NEXT:    cmn w12, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w12, w12, w10, gt
+; CHECK-CVT-NEXT:    csel w12, w12, w11, gt
 ; CHECK-CVT-NEXT:    cmp w13, #4095
 ; CHECK-CVT-NEXT:    csel w13, w13, w8, lt
-; CHECK-CVT-NEXT:    fmov s2, w11
+; CHECK-CVT-NEXT:    mov v1.s[1], w9
+; CHECK-CVT-NEXT:    fcvtzs w9, s0
 ; CHECK-CVT-NEXT:    cmn w13, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    csel w13, w13, w10, gt
+; CHECK-CVT-NEXT:    csel w13, w13, w11, gt
 ; CHECK-CVT-NEXT:    cmp w14, #4095
 ; CHECK-CVT-NEXT:    csel w14, w14, w8, lt
 ; CHECK-CVT-NEXT:    cmn w14, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    csel w14, w14, w10, gt
+; CHECK-CVT-NEXT:    mov v1.s[2], w12
+; CHECK-CVT-NEXT:    csel w14, w14, w11, gt
 ; CHECK-CVT-NEXT:    cmp w15, #4095
 ; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
 ; CHECK-CVT-NEXT:    cmn w15, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    csel w11, w15, w10, gt
-; CHECK-CVT-NEXT:    fcvtzs w15, s1
-; CHECK-CVT-NEXT:    mov v2.s[1], w9
-; CHECK-CVT-NEXT:    fmov s1, w11
-; CHECK-CVT-NEXT:    cmp w15, #4095
-; CHECK-CVT-NEXT:    csel w9, w15, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w11, s0
-; CHECK-CVT-NEXT:    cmn w9, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    mov v1.s[1], w14
-; CHECK-CVT-NEXT:    csel w9, w9, w10, gt
-; CHECK-CVT-NEXT:    cmp w11, #4095
-; CHECK-CVT-NEXT:    csel w8, w11, w8, lt
-; CHECK-CVT-NEXT:    mov v2.s[2], w12
+; CHECK-CVT-NEXT:    csel w10, w15, w11, gt
+; CHECK-CVT-NEXT:    cmp w16, #4095
+; CHECK-CVT-NEXT:    mov v1.s[3], w13
+; CHECK-CVT-NEXT:    fmov s2, w10
+; CHECK-CVT-NEXT:    csel w10, w16, w8, lt
+; CHECK-CVT-NEXT:    cmn w10, #1, lsl #12 // =4096
+; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
+; CHECK-CVT-NEXT:    cmp w9, #4095
+; CHECK-CVT-NEXT:    mov v2.s[1], w14
+; CHECK-CVT-NEXT:    csel w8, w9, w8, lt
 ; CHECK-CVT-NEXT:    cmn w8, #1, lsl #12 // =4096
-; CHECK-CVT-NEXT:    mov v1.s[2], w9
-; CHECK-CVT-NEXT:    csel w8, w8, w10, gt
-; CHECK-CVT-NEXT:    mov v2.s[3], w13
-; CHECK-CVT-NEXT:    mov v1.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
+; CHECK-CVT-NEXT:    csel w8, w8, w11, gt
+; CHECK-CVT-NEXT:    mov v2.s[2], w10
+; CHECK-CVT-NEXT:    mov v2.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v8f16_v8i13:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    fcvtzs v0.8h, v0.8h
+; CHECK-FP16-NEXT:    mvni v1.8h, #240, lsl #8
+; CHECK-FP16-NEXT:    smin v0.8h, v0.8h, v1.8h
 ; CHECK-FP16-NEXT:    movi v1.8h, #240, lsl #8
-; CHECK-FP16-NEXT:    mvni v2.8h, #240, lsl #8
-; CHECK-FP16-NEXT:    smin v0.8h, v0.8h, v2.8h
 ; CHECK-FP16-NEXT:    smax v0.8h, v0.8h, v1.8h
 ; CHECK-FP16-NEXT:    ret
     %x = call <8 x i13> @llvm.fptosi.sat.v8f16.v8i13(<8 x half> %f)
@@ -2268,62 +2268,62 @@ define <8 x i16> @test_signed_v8f16_v8i16(<8 x half> %f) {
 ; CHECK-CVT-NEXT:    fcvtl2 v1.4s, v0.8h
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w10, #-32768 // =0xffff8000
+; CHECK-CVT-NEXT:    mov w11, #-32768 // =0xffff8000
 ; CHECK-CVT-NEXT:    mov s2, v1.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w11, s1
+; CHECK-CVT-NEXT:    fcvtzs w10, s1
 ; CHECK-CVT-NEXT:    fcvtzs w15, s0
 ; CHECK-CVT-NEXT:    fcvtzs w9, s2
 ; CHECK-CVT-NEXT:    mov s2, v1.s[2]
 ; CHECK-CVT-NEXT:    mov s1, v1.s[3]
 ; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    csel w9, w9, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w12, s2
-; CHECK-CVT-NEXT:    cmn w9, #8, lsl #12 // =32768
 ; CHECK-CVT-NEXT:    mov s2, v0.s[1]
-; CHECK-CVT-NEXT:    csel w9, w9, w10, gt
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lt
+; CHECK-CVT-NEXT:    csel w9, w9, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w13, s1
-; CHECK-CVT-NEXT:    cmn w11, #8, lsl #12 // =32768
 ; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    csel w11, w11, w10, gt
+; CHECK-CVT-NEXT:    cmn w9, #8, lsl #12 // =32768
+; CHECK-CVT-NEXT:    mov s0, v0.s[3]
+; CHECK-CVT-NEXT:    csel w9, w9, w11, gt
+; CHECK-CVT-NEXT:    cmp w10, w8
+; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
+; CHECK-CVT-NEXT:    fcvtzs w14, s2
+; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-CVT-NEXT:    fcvtzs w16, s1
+; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
 ; CHECK-CVT-NEXT:    cmp w12, w8
 ; CHECK-CVT-NEXT:    csel w12, w12, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w14, s2
+; CHECK-CVT-NEXT:    fmov s1, w10
 ; CHECK-CVT-NEXT:    cmn w12, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w12, w12, w10, gt
+; CHECK-CVT-NEXT:    csel w12, w12, w11, gt
 ; CHECK-CVT-NEXT:    cmp w13, w8
 ; CHECK-CVT-NEXT:    csel w13, w13, w8, lt
-; CHECK-CVT-NEXT:    fmov s2, w11
+; CHECK-CVT-NEXT:    mov v1.s[1], w9
+; CHECK-CVT-NEXT:    fcvtzs w9, s0
 ; CHECK-CVT-NEXT:    cmn w13, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w13, w13, w10, gt
+; CHECK-CVT-NEXT:    csel w13, w13, w11, gt
 ; CHECK-CVT-NEXT:    cmp w14, w8
 ; CHECK-CVT-NEXT:    csel w14, w14, w8, lt
 ; CHECK-CVT-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w14, w14, w10, gt
+; CHECK-CVT-NEXT:    mov v1.s[2], w12
+; CHECK-CVT-NEXT:    csel w14, w14, w11, gt
 ; CHECK-CVT-NEXT:    cmp w15, w8
 ; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
 ; CHECK-CVT-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w11, w15, w10, gt
-; CHECK-CVT-NEXT:    fcvtzs w15, s1
-; CHECK-CVT-NEXT:    mov v2.s[1], w9
-; CHECK-CVT-NEXT:    fmov s1, w11
-; CHECK-CVT-NEXT:    cmp w15, w8
-; CHECK-CVT-NEXT:    csel w9, w15, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w11, s0
-; CHECK-CVT-NEXT:    cmn w9, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov v1.s[1], w14
-; CHECK-CVT-NEXT:    csel w9, w9, w10, gt
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    csel w8, w11, w8, lt
-; CHECK-CVT-NEXT:    mov v2.s[2], w12
+; CHECK-CVT-NEXT:    csel w10, w15, w11, gt
+; CHECK-CVT-NEXT:    cmp w16, w8
+; CHECK-CVT-NEXT:    mov v1.s[3], w13
+; CHECK-CVT-NEXT:    fmov s2, w10
+; CHECK-CVT-NEXT:    csel w10, w16, w8, lt
+; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-CVT-NEXT:    csel w10, w10, w11, gt
+; CHECK-CVT-NEXT:    cmp w9, w8
+; CHECK-CVT-NEXT:    mov v2.s[1], w14
+; CHECK-CVT-NEXT:    csel w8, w9, w8, lt
 ; CHECK-CVT-NEXT:    cmn w8, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov v1.s[2], w9
-; CHECK-CVT-NEXT:    csel w8, w8, w10, gt
-; CHECK-CVT-NEXT:    mov v2.s[3], w13
-; CHECK-CVT-NEXT:    mov v1.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
+; CHECK-CVT-NEXT:    csel w8, w8, w11, gt
+; CHECK-CVT-NEXT:    mov v2.s[2], w10
+; CHECK-CVT-NEXT:    mov v2.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v8f16_v8i16:
@@ -2340,21 +2340,21 @@ define <8 x i19> @test_signed_v8f16_v8i19(<8 x half> %f) {
 ; CHECK-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-NEXT:    fcvtl2 v0.4s, v0.8h
 ; CHECK-NEXT:    movi v1.4s, #3, msl #16
+; CHECK-NEXT:    mvni v3.4s, #3, msl #16
 ; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
 ; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    smin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    mvni v1.4s, #3, msl #16
-; CHECK-NEXT:    smax v2.4s, v2.4s, v1.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    mov w1, v2.s[1]
-; CHECK-NEXT:    mov w2, v2.s[2]
+; CHECK-NEXT:    smax v1.4s, v2.4s, v3.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    mov w1, v1.s[1]
+; CHECK-NEXT:    mov w2, v1.s[2]
+; CHECK-NEXT:    mov w3, v1.s[3]
 ; CHECK-NEXT:    mov w5, v0.s[1]
-; CHECK-NEXT:    mov w3, v2.s[3]
 ; CHECK-NEXT:    mov w6, v0.s[2]
 ; CHECK-NEXT:    mov w7, v0.s[3]
 ; CHECK-NEXT:    fmov w4, s0
-; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    fmov w0, s1
 ; CHECK-NEXT:    ret
     %x = call <8 x i19> @llvm.fptosi.sat.v8f16.v8i19(<8 x half> %f)
     ret <8 x i19> %x
@@ -2377,61 +2377,61 @@ define <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) {
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-CVT-NEXT:    mov x8, #562949953421311 // =0x1ffffffffffff
-; CHECK-CVT-NEXT:    mov x12, #-562949953421312 // =0xfffe000000000000
-; CHECK-CVT-NEXT:    fcvt s5, h0
+; CHECK-CVT-NEXT:    mov x11, #-562949953421312 // =0xfffe000000000000
 ; CHECK-CVT-NEXT:    mov h2, v1.h[1]
 ; CHECK-CVT-NEXT:    fcvt s3, h1
 ; CHECK-CVT-NEXT:    mov h4, v1.h[2]
 ; CHECK-CVT-NEXT:    mov h1, v1.h[3]
-; CHECK-CVT-NEXT:    fcvtzs x10, s5
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvtzs x9, s3
 ; CHECK-CVT-NEXT:    fcvt s3, h4
 ; CHECK-CVT-NEXT:    fcvt s1, h1
+; CHECK-CVT-NEXT:    fcvtzs x10, s2
 ; CHECK-CVT-NEXT:    cmp x9, x8
-; CHECK-CVT-NEXT:    fcvtzs x11, s2
+; CHECK-CVT-NEXT:    fcvtzs x12, s3
 ; CHECK-CVT-NEXT:    csel x9, x9, x8, lt
-; CHECK-CVT-NEXT:    cmp x9, x12
-; CHECK-CVT-NEXT:    fcvtzs x13, s3
-; CHECK-CVT-NEXT:    csel x4, x9, x12, gt
 ; CHECK-CVT-NEXT:    mov h2, v0.h[1]
-; CHECK-CVT-NEXT:    cmp x11, x8
-; CHECK-CVT-NEXT:    fcvtzs x9, s1
-; CHECK-CVT-NEXT:    csel x11, x11, x8, lt
+; CHECK-CVT-NEXT:    fcvt s3, h0
+; CHECK-CVT-NEXT:    cmp x9, x11
+; CHECK-CVT-NEXT:    csel x4, x9, x11, gt
+; CHECK-CVT-NEXT:    cmp x10, x8
+; CHECK-CVT-NEXT:    csel x9, x10, x8, lt
+; CHECK-CVT-NEXT:    fcvtzs x10, s1
 ; CHECK-CVT-NEXT:    mov h1, v0.h[2]
-; CHECK-CVT-NEXT:    cmp x11, x12
-; CHECK-CVT-NEXT:    mov h0, v0.h[3]
-; CHECK-CVT-NEXT:    csel x5, x11, x12, gt
-; CHECK-CVT-NEXT:    cmp x13, x8
-; CHECK-CVT-NEXT:    csel x11, x13, x8, lt
+; CHECK-CVT-NEXT:    cmp x9, x11
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    cmp x11, x12
+; CHECK-CVT-NEXT:    mov h0, v0.h[3]
+; CHECK-CVT-NEXT:    csel x5, x9, x11, gt
+; CHECK-CVT-NEXT:    cmp x12, x8
+; CHECK-CVT-NEXT:    csel x9, x12, x8, lt
+; CHECK-CVT-NEXT:    fcvtzs x12, s3
+; CHECK-CVT-NEXT:    cmp x9, x11
 ; CHECK-CVT-NEXT:    fcvt s1, h1
-; CHECK-CVT-NEXT:    csel x6, x11, x12, gt
-; CHECK-CVT-NEXT:    cmp x9, x8
-; CHECK-CVT-NEXT:    csel x9, x9, x8, lt
-; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    cmp x9, x12
-; CHECK-CVT-NEXT:    fcvtzs x11, s2
-; CHECK-CVT-NEXT:    csel x7, x9, x12, gt
+; CHECK-CVT-NEXT:    csel x6, x9, x11, gt
 ; CHECK-CVT-NEXT:    cmp x10, x8
+; CHECK-CVT-NEXT:    fcvt s0, h0
 ; CHECK-CVT-NEXT:    csel x9, x10, x8, lt
-; CHECK-CVT-NEXT:    fcvtzs x10, s1
-; CHECK-CVT-NEXT:    cmp x9, x12
-; CHECK-CVT-NEXT:    csel x0, x9, x12, gt
-; CHECK-CVT-NEXT:    cmp x11, x8
-; CHECK-CVT-NEXT:    csel x9, x11, x8, lt
-; CHECK-CVT-NEXT:    fcvtzs x11, s0
-; CHECK-CVT-NEXT:    cmp x9, x12
-; CHECK-CVT-NEXT:    csel x1, x9, x12, gt
+; CHECK-CVT-NEXT:    fcvtzs x10, s2
+; CHECK-CVT-NEXT:    cmp x9, x11
+; CHECK-CVT-NEXT:    csel x7, x9, x11, gt
+; CHECK-CVT-NEXT:    cmp x12, x8
+; CHECK-CVT-NEXT:    csel x9, x12, x8, lt
+; CHECK-CVT-NEXT:    fcvtzs x12, s1
+; CHECK-CVT-NEXT:    cmp x9, x11
+; CHECK-CVT-NEXT:    csel x0, x9, x11, gt
 ; CHECK-CVT-NEXT:    cmp x10, x8
 ; CHECK-CVT-NEXT:    csel x9, x10, x8, lt
-; CHECK-CVT-NEXT:    cmp x9, x12
-; CHECK-CVT-NEXT:    csel x2, x9, x12, gt
-; CHECK-CVT-NEXT:    cmp x11, x8
-; CHECK-CVT-NEXT:    csel x8, x11, x8, lt
-; CHECK-CVT-NEXT:    cmp x8, x12
-; CHECK-CVT-NEXT:    csel x3, x8, x12, gt
+; CHECK-CVT-NEXT:    fcvtzs x10, s0
+; CHECK-CVT-NEXT:    cmp x9, x11
+; CHECK-CVT-NEXT:    csel x1, x9, x11, gt
+; CHECK-CVT-NEXT:    cmp x12, x8
+; CHECK-CVT-NEXT:    csel x9, x12, x8, lt
+; CHECK-CVT-NEXT:    cmp x9, x11
+; CHECK-CVT-NEXT:    csel x2, x9, x11, gt
+; CHECK-CVT-NEXT:    cmp x10, x8
+; CHECK-CVT-NEXT:    csel x8, x10, x8, lt
+; CHECK-CVT-NEXT:    cmp x8, x11
+; CHECK-CVT-NEXT:    csel x3, x8, x11, gt
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v8f16_v8i50:
@@ -2443,24 +2443,24 @@ define <8 x i50> @test_signed_v8f16_v8i50(<8 x half> %f) {
 ; CHECK-FP16-NEXT:    fcvtzs x9, h1
 ; CHECK-FP16-NEXT:    mov h3, v1.h[2]
 ; CHECK-FP16-NEXT:    mov h1, v1.h[3]
-; CHECK-FP16-NEXT:    cmp x9, x8
 ; CHECK-FP16-NEXT:    fcvtzs x10, h2
+; CHECK-FP16-NEXT:    cmp x9, x8
+; CHECK-FP16-NEXT:    fcvtzs x12, h3
 ; CHECK-FP16-NEXT:    csel x9, x9, x8, lt
+; CHECK-FP16-NEXT:    mov h2, v0.h[2]
 ; CHECK-FP16-NEXT:    cmp x9, x11
-; CHECK-FP16-NEXT:    fcvtzs x12, h3
 ; CHECK-FP16-NEXT:    csel x4, x9, x11, gt
-; CHECK-FP16-NEXT:    mov h2, v0.h[2]
 ; CHECK-FP16-NEXT:    cmp x10, x8
 ; CHECK-FP16-NEXT:    csel x9, x10, x8, lt
 ; CHECK-FP16-NEXT:    fcvtzs x10, h1
-; CHECK-FP16-NEXT:    cmp x9, x11
 ; CHECK-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-FP16-NEXT:    cmp x9, x11
 ; CHECK-FP16-NEXT:    csel x5, x9, x11, gt
 ; CHECK-FP16-NEXT:    cmp x12, x8
 ; CHECK-FP16-NEXT:    csel x9, x12, x8, lt
 ; CHECK-FP16-NEXT:    fcvtzs x12, h0
-; CHECK-FP16-NEXT:    cmp x9, x11
 ; CHECK-FP16-NEXT:    mov h0, v0.h[3]
+; CHECK-FP16-NEXT:    cmp x9, x11
 ; CHECK-FP16-NEXT:    csel x6, x9, x11, gt
 ; CHECK-FP16-NEXT:    cmp x10, x8
 ; CHECK-FP16-NEXT:    csel x9, x10, x8, lt
@@ -2494,63 +2494,63 @@ define <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_signed_v8f16_v8i64:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    fcvt s3, h0
-; CHECK-CVT-NEXT:    mov h7, v0.h[1]
-; CHECK-CVT-NEXT:    mov h0, v0.h[3]
-; CHECK-CVT-NEXT:    mov h4, v1.h[1]
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s5, h1
-; CHECK-CVT-NEXT:    mov h1, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v0.h[1]
+; CHECK-CVT-NEXT:    mov h7, v0.h[3]
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    mov h2, v1.h[2]
+; CHECK-CVT-NEXT:    mov h5, v1.h[1]
+; CHECK-CVT-NEXT:    mov h6, v1.h[3]
+; CHECK-CVT-NEXT:    fcvt s1, h1
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s7, h7
+; CHECK-CVT-NEXT:    fcvtzs x9, s0
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvtzs x8, s3
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvtzs x9, s5
-; CHECK-CVT-NEXT:    fcvt s5, h7
-; CHECK-CVT-NEXT:    fcvt s6, h0
-; CHECK-CVT-NEXT:    fcvt s7, h1
-; CHECK-CVT-NEXT:    fcvtzs x10, s2
-; CHECK-CVT-NEXT:    fmov d0, x8
-; CHECK-CVT-NEXT:    fmov d2, x9
-; CHECK-CVT-NEXT:    fcvtzs x9, s4
+; CHECK-CVT-NEXT:    fcvt s5, h5
+; CHECK-CVT-NEXT:    fcvt s6, h6
+; CHECK-CVT-NEXT:    fcvtzs x8, s1
+; CHECK-CVT-NEXT:    fcvtzs x12, s4
 ; CHECK-CVT-NEXT:    fcvtzs x11, s3
-; CHECK-CVT-NEXT:    fcvtzs x8, s5
-; CHECK-CVT-NEXT:    fmov d1, x10
-; CHECK-CVT-NEXT:    fcvtzs x10, s6
-; CHECK-CVT-NEXT:    fmov d3, x9
-; CHECK-CVT-NEXT:    fcvtzs x9, s7
-; CHECK-CVT-NEXT:    mov v2.d[1], x11
-; CHECK-CVT-NEXT:    mov v0.d[1], x8
-; CHECK-CVT-NEXT:    mov v1.d[1], x10
-; CHECK-CVT-NEXT:    mov v3.d[1], x9
+; CHECK-CVT-NEXT:    fcvtzs x15, s7
+; CHECK-CVT-NEXT:    fmov d0, x9
+; CHECK-CVT-NEXT:    fcvtzs x10, s2
+; CHECK-CVT-NEXT:    fcvtzs x13, s5
+; CHECK-CVT-NEXT:    fcvtzs x14, s6
+; CHECK-CVT-NEXT:    fmov d2, x8
+; CHECK-CVT-NEXT:    fmov d1, x12
+; CHECK-CVT-NEXT:    mov v0.d[1], x11
+; CHECK-CVT-NEXT:    fmov d3, x10
+; CHECK-CVT-NEXT:    mov v2.d[1], x13
+; CHECK-CVT-NEXT:    mov v1.d[1], x15
+; CHECK-CVT-NEXT:    mov v3.d[1], x14
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v8f16_v8i64:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-FP16-NEXT:    mov h2, v0.h[2]
-; CHECK-FP16-NEXT:    mov h5, v0.h[1]
-; CHECK-FP16-NEXT:    mov h6, v0.h[3]
-; CHECK-FP16-NEXT:    fcvtzs x8, h0
-; CHECK-FP16-NEXT:    mov h4, v1.h[2]
-; CHECK-FP16-NEXT:    fcvtzs x9, h1
-; CHECK-FP16-NEXT:    mov h3, v1.h[1]
-; CHECK-FP16-NEXT:    mov h7, v1.h[3]
-; CHECK-FP16-NEXT:    fcvtzs x10, h2
-; CHECK-FP16-NEXT:    fmov d0, x8
-; CHECK-FP16-NEXT:    fmov d2, x9
-; CHECK-FP16-NEXT:    fcvtzs x8, h5
-; CHECK-FP16-NEXT:    fcvtzs x9, h4
+; CHECK-FP16-NEXT:    mov h4, v0.h[2]
+; CHECK-FP16-NEXT:    mov h3, v0.h[1]
+; CHECK-FP16-NEXT:    mov h7, v0.h[3]
+; CHECK-FP16-NEXT:    fcvtzs x9, h0
+; CHECK-FP16-NEXT:    mov h2, v1.h[2]
+; CHECK-FP16-NEXT:    mov h5, v1.h[1]
+; CHECK-FP16-NEXT:    mov h6, v1.h[3]
+; CHECK-FP16-NEXT:    fcvtzs x8, h1
+; CHECK-FP16-NEXT:    fcvtzs x12, h4
 ; CHECK-FP16-NEXT:    fcvtzs x11, h3
-; CHECK-FP16-NEXT:    fmov d1, x10
-; CHECK-FP16-NEXT:    fcvtzs x10, h6
-; CHECK-FP16-NEXT:    fmov d3, x9
-; CHECK-FP16-NEXT:    fcvtzs x9, h7
-; CHECK-FP16-NEXT:    mov v2.d[1], x11
-; CHECK-FP16-NEXT:    mov v0.d[1], x8
-; CHECK-FP16-NEXT:    mov v1.d[1], x10
-; CHECK-FP16-NEXT:    mov v3.d[1], x9
+; CHECK-FP16-NEXT:    fcvtzs x15, h7
+; CHECK-FP16-NEXT:    fmov d0, x9
+; CHECK-FP16-NEXT:    fcvtzs x10, h2
+; CHECK-FP16-NEXT:    fcvtzs x13, h5
+; CHECK-FP16-NEXT:    fcvtzs x14, h6
+; CHECK-FP16-NEXT:    fmov d2, x8
+; CHECK-FP16-NEXT:    fmov d1, x12
+; CHECK-FP16-NEXT:    mov v0.d[1], x11
+; CHECK-FP16-NEXT:    fmov d3, x10
+; CHECK-FP16-NEXT:    mov v2.d[1], x13
+; CHECK-FP16-NEXT:    mov v1.d[1], x15
+; CHECK-FP16-NEXT:    mov v3.d[1], x14
 ; CHECK-FP16-NEXT:    ret
     %x = call <8 x i64> @llvm.fptosi.sat.v8f16.v8i64(<8 x half> %f)
     ret <8 x i64> %x
@@ -2585,8 +2585,8 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    .cfi_offset b9, -112
 ; CHECK-NEXT:    .cfi_offset b10, -128
 ; CHECK-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x19, x8
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    mov x19, x8
 ; CHECK-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov h0, v0.h[1]
 ; CHECK-NEXT:    fcvt s8, h0
@@ -2595,76 +2595,76 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    movi v10.2s, #241, lsl #24
 ; CHECK-NEXT:    mov w8, #1895825407 // =0x70ffffff
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x25, #-34359738368 // =0xfffffff800000000
-; CHECK-NEXT:    mov x22, #34359738367 // =0x7ffffffff
 ; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    mov x21, #-34359738368 // =0xfffffff800000000
+; CHECK-NEXT:    mov x23, #34359738367 // =0x7ffffffff
 ; CHECK-NEXT:    mov h0, v0.h[3]
-; CHECK-NEXT:    csel x8, x25, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    csel x8, x21, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x22, x8, gt
+; CHECK-NEXT:    csel x8, x23, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x8, xzr, x8, vs
-; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x8, [sp, #72] // 8-byte Folded Spill
 ; CHECK-NEXT:    csel x8, xzr, x9, vs
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __fixsfti
 ; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x25, x1, lt
+; CHECK-NEXT:    csel x9, x21, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    csel x9, x22, x9, gt
+; CHECK-NEXT:    csel x9, x23, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x10, xzr, x8, vs
 ; CHECK-NEXT:    csel x8, xzr, x9, vs
-; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    stp x8, x10, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov h0, v0.h[2]
-; CHECK-NEXT:    csel x8, x25, x1, lt
+; CHECK-NEXT:    csel x8, x21, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x22, x8, gt
+; CHECK-NEXT:    csel x8, x23, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x26, xzr, x8, vs
 ; CHECK-NEXT:    csel x8, xzr, x9, vs
-; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x8, [sp, #32] // 8-byte Folded Spill
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    csel x8, x25, x1, lt
+; CHECK-NEXT:    csel x8, x21, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x22, x8, gt
+; CHECK-NEXT:    csel x8, x23, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x28, xzr, x8, vs
 ; CHECK-NEXT:    csel x8, xzr, x9, vs
-; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x8, [sp] // 8-byte Folded Spill
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov h0, v0.h[3]
-; CHECK-NEXT:    csel x8, x25, x1, lt
+; CHECK-NEXT:    csel x8, x21, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x22, x8, gt
+; CHECK-NEXT:    csel x8, x23, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x27, xzr, x8, vs
@@ -2674,79 +2674,79 @@ define <8 x i100> @test_signed_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x25, x1, lt
+; CHECK-NEXT:    csel x9, x21, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    csel x9, x22, x9, gt
+; CHECK-NEXT:    csel x9, x23, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x29, xzr, x8, vs
-; CHECK-NEXT:    csel x21, xzr, x9, vs
+; CHECK-NEXT:    csel x22, xzr, x8, vs
+; CHECK-NEXT:    csel x29, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov h0, v0.h[2]
-; CHECK-NEXT:    csel x8, x25, x1, lt
+; CHECK-NEXT:    csel x8, x21, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csel x8, x22, x8, gt
+; CHECK-NEXT:    csel x8, x23, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x23, xzr, x8, vs
-; CHECK-NEXT:    csel x24, xzr, x9, vs
+; CHECK-NEXT:    csel x24, xzr, x8, vs
+; CHECK-NEXT:    csel x25, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    ldr x9, [sp] // 8-byte Folded Reload
+; CHECK-NEXT:    extr x8, x29, x22, #28
 ; CHECK-NEXT:    fcmp s8, s10
-; CHECK-NEXT:    extr x9, x21, x29, #28
-; CHECK-NEXT:    bfi x23, x20, #36, #28
-; CHECK-NEXT:    extr x11, x27, x20, #28
-; CHECK-NEXT:    str x24, [x19]
-; CHECK-NEXT:    csel x8, x25, x1, lt
-; CHECK-NEXT:    csel x10, xzr, x0, lt
-; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    stur x9, [x19, #41]
-; CHECK-NEXT:    stp x23, x11, [x19, #8]
+; CHECK-NEXT:    bfi x24, x20, #36, #28
 ; CHECK-NEXT:    lsr x11, x27, #28
-; CHECK-NEXT:    csinv x9, x10, xzr, le
-; CHECK-NEXT:    lsr x10, x21, #28
-; CHECK-NEXT:    csel x8, x22, x8, gt
+; CHECK-NEXT:    stur x9, [x19, #75]
+; CHECK-NEXT:    extr x9, x27, x20, #28
+; CHECK-NEXT:    stur x8, [x19, #41]
+; CHECK-NEXT:    csel x8, x21, x1, lt
+; CHECK-NEXT:    str x9, [x19, #16]
+; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    ldr x10, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    stp x25, x24, [x19]
+; CHECK-NEXT:    stur x10, [x19, #50]
+; CHECK-NEXT:    lsr x10, x29, #28
+; CHECK-NEXT:    csinv x9, x9, xzr, le
+; CHECK-NEXT:    csel x8, x23, x8, gt
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    strb w10, [x19, #49]
+; CHECK-NEXT:    ldp x14, x12, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    strb w11, [x19, #24]
 ; CHECK-NEXT:    csel x8, xzr, x8, vs
-; CHECK-NEXT:    ldr x10, [sp] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr x13, [sp, #24] // 8-byte Folded Reload
 ; CHECK-NEXT:    csel x9, xzr, x9, vs
-; CHECK-NEXT:    bfi x8, x29, #36, #28
-; CHECK-NEXT:    strb w11, [x19, #24]
-; CHECK-NEXT:    stur x10, [x19, #75]
-; CHECK-NEXT:    ldp x12, x11, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    bfi x8, x22, #36, #28
+; CHECK-NEXT:    extr x10, x14, x12, #28
+; CHECK-NEXT:    bfi x28, x12, #36, #28
+; CHECK-NEXT:    ldr x12, [sp, #72] // 8-byte Folded Reload
+; CHECK-NEXT:    bfi x26, x13, #36, #28
 ; CHECK-NEXT:    stur x9, [x19, #25]
+; CHECK-NEXT:    lsr x9, x14, #28
+; CHECK-NEXT:    extr x11, x12, x13, #28
 ; CHECK-NEXT:    stur x8, [x19, #33]
-; CHECK-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    extr x10, x12, x11, #28
-; CHECK-NEXT:    bfi x28, x11, #36, #28
-; CHECK-NEXT:    stur x8, [x19, #50]
-; CHECK-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x11, [sp, #72] // 8-byte Folded Reload
+; CHECK-NEXT:    lsr x8, x12, #28
 ; CHECK-NEXT:    stur x10, [x19, #91]
 ; CHECK-NEXT:    stur x28, [x19, #83]
-; CHECK-NEXT:    extr x8, x11, x9, #28
-; CHECK-NEXT:    bfi x26, x9, #36, #28
-; CHECK-NEXT:    lsr x9, x12, #28
-; CHECK-NEXT:    stur x8, [x19, #66]
-; CHECK-NEXT:    lsr x8, x11, #28
+; CHECK-NEXT:    stur x11, [x19, #66]
 ; CHECK-NEXT:    stur x26, [x19, #58]
 ; CHECK-NEXT:    strb w9, [x19, #99]
 ; CHECK-NEXT:    strb w8, [x19, #74]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d10, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #160] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x24, x23, [sp, #144] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x26, x25, [sp, #128] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x28, x27, [sp, #112] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr d10, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #192
 ; CHECK-NEXT:    ret
     %x = call <8 x i100> @llvm.fptosi.sat.v8f16.v8i100(<8 x half> %f)
@@ -2782,69 +2782,69 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) {
 ; CHECK-NEXT:    .cfi_offset b9, -112
 ; CHECK-NEXT:    .cfi_offset b10, -128
 ; CHECK-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x19, x8
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    mov x19, x8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    movi v10.2s, #255, lsl #24
+; CHECK-NEXT:    movi v9.2s, #255, lsl #24
 ; CHECK-NEXT:    mov w8, #2130706431 // =0x7effffff
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    mov x21, #-9223372036854775808 // =0x8000000000000000
+; CHECK-NEXT:    fmov s10, w8
+; CHECK-NEXT:    mov x23, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    mov x22, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x21, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x23, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x8, xzr, x8, vs
-; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x8, [sp, #72] // 8-byte Folded Spill
 ; CHECK-NEXT:    csel x8, xzr, x9, vs
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x8, [sp, #24] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    mov h0, v0.h[2]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    csel x9, x23, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x10, xzr, x8, vs
 ; CHECK-NEXT:    csel x8, xzr, x9, vs
-; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    stp x8, x10, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    fcmp s8, s10
-; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x21, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov h0, v0.h[3]
+; CHECK-NEXT:    csel x8, xzr, x0, lt
+; CHECK-NEXT:    csel x9, x23, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x8, xzr, x8, vs
-; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x8, [sp, #32] // 8-byte Folded Spill
 ; CHECK-NEXT:    csel x8, xzr, x9, vs
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x8, [sp] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    csel x9, x23, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
@@ -2853,26 +2853,26 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) {
 ; CHECK-NEXT:    csel x29, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    mov h0, v0.h[1]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    csel x9, x23, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csel x20, xzr, x8, vs
-; CHECK-NEXT:    csel x23, xzr, x9, vs
+; CHECK-NEXT:    csel x21, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    mov h0, v0.h[2]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    csel x9, x23, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
@@ -2881,12 +2881,12 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) {
 ; CHECK-NEXT:    csel x25, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    csel x9, x23, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
@@ -2895,24 +2895,24 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) {
 ; CHECK-NEXT:    csel x27, xzr, x9, vs
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixsfti
-; CHECK-NEXT:    fcmp s8, s10
+; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    stp x26, x27, [x19, #32]
 ; CHECK-NEXT:    stp x24, x25, [x19, #16]
-; CHECK-NEXT:    stp x20, x23, [x19]
+; CHECK-NEXT:    stp x20, x21, [x19]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, x21, x1, lt
-; CHECK-NEXT:    fcmp s8, s9
+; CHECK-NEXT:    csel x9, x23, x1, lt
+; CHECK-NEXT:    fcmp s8, s10
 ; CHECK-NEXT:    stp x28, x29, [x19, #112]
-; CHECK-NEXT:    ldr x10, [sp] // 8-byte Folded Reload
 ; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    fcmp s8, s8
-; CHECK-NEXT:    str x10, [x19, #104]
-; CHECK-NEXT:    ldr x10, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    csel x9, xzr, x9, vs
 ; CHECK-NEXT:    csel x8, xzr, x8, vs
-; CHECK-NEXT:    str x10, [x19, #96]
 ; CHECK-NEXT:    stp x8, x9, [x19, #48]
+; CHECK-NEXT:    ldr x8, [sp] // 8-byte Folded Reload
+; CHECK-NEXT:    str x8, [x19, #104]
+; CHECK-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    str x8, [x19, #96]
 ; CHECK-NEXT:    ldr x8, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    str x8, [x19, #88]
 ; CHECK-NEXT:    ldr x8, [sp, #16] // 8-byte Folded Reload
@@ -2922,13 +2922,13 @@ define <8 x i128> @test_signed_v8f16_v8i128(<8 x half> %f) {
 ; CHECK-NEXT:    ldr x8, [sp, #72] // 8-byte Folded Reload
 ; CHECK-NEXT:    str x8, [x19, #64]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr d10, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #160] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x24, x23, [sp, #144] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x26, x25, [sp, #128] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x28, x27, [sp, #112] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr d10, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #192
 ; CHECK-NEXT:    ret
     %x = call <8 x i128> @llvm.fptosi.sat.v8f16.v8i128(<8 x half> %f)
@@ -3010,11 +3010,11 @@ define <16 x i16> @test_signed_v16f32_v16i16(<16 x float> %f) {
 ; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
 ; CHECK-NEXT:    fcvtzs v4.4s, v1.4s
-; CHECK-NEXT:    fcvtzs v3.4s, v3.4s
 ; CHECK-NEXT:    sqxtn v0.4h, v0.4s
 ; CHECK-NEXT:    sqxtn v1.4h, v2.4s
+; CHECK-NEXT:    fcvtzs v2.4s, v3.4s
 ; CHECK-NEXT:    sqxtn2 v0.8h, v4.4s
-; CHECK-NEXT:    sqxtn2 v1.8h, v3.4s
+; CHECK-NEXT:    sqxtn2 v1.8h, v2.4s
 ; CHECK-NEXT:    ret
     %x = call <16 x i16> @llvm.fptosi.sat.v16f32.v16i16(<16 x float> %f)
     ret <16 x i16> %x
@@ -3028,119 +3028,119 @@ define <16 x i8> @test_signed_v16f16_v16i8(<16 x half> %f) {
 ; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
 ; CHECK-CVT-NEXT:    mov w8, #127 // =0x7f
 ; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT:    mov w9, #-128 // =0xffffff80
 ; CHECK-CVT-NEXT:    mov s3, v2.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w11, s2
-; CHECK-CVT-NEXT:    fcvtzs w10, s3
+; CHECK-CVT-NEXT:    fcvtzs w10, s2
+; CHECK-CVT-NEXT:    fcvtzs w9, s3
 ; CHECK-CVT-NEXT:    mov s3, v2.s[2]
 ; CHECK-CVT-NEXT:    mov s2, v2.s[3]
-; CHECK-CVT-NEXT:    cmp w10, #127
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
+; CHECK-CVT-NEXT:    cmp w9, #127
 ; CHECK-CVT-NEXT:    fcvtzs w12, s3
-; CHECK-CVT-NEXT:    cmn w10, #128
 ; CHECK-CVT-NEXT:    mov s3, v1.s[1]
-; CHECK-CVT-NEXT:    csel w10, w10, w9, gt
-; CHECK-CVT-NEXT:    cmp w11, #127
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lt
+; CHECK-CVT-NEXT:    csel w11, w9, w8, lt
+; CHECK-CVT-NEXT:    mov w9, #-128 // =0xffffff80
 ; CHECK-CVT-NEXT:    fcvtzs w14, s2
 ; CHECK-CVT-NEXT:    cmn w11, #128
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT:    mov s2, v1.s[2]
 ; CHECK-CVT-NEXT:    csel w11, w11, w9, gt
-; CHECK-CVT-NEXT:    cmp w12, #127
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lt
+; CHECK-CVT-NEXT:    cmp w10, #127
+; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w15, s3
-; CHECK-CVT-NEXT:    cmn w12, #128
-; CHECK-CVT-NEXT:    mov s3, v1.s[2]
-; CHECK-CVT-NEXT:    csel w13, w12, w9, gt
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-NEXT:    cmn w10, #128
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    csel w13, w10, w9, gt
+; CHECK-CVT-NEXT:    cmp w12, #127
+; CHECK-CVT-NEXT:    fcvtzs w16, s2
+; CHECK-CVT-NEXT:    csel w10, w12, w8, lt
+; CHECK-CVT-NEXT:    cmn w10, #128
+; CHECK-CVT-NEXT:    mov s2, v3.s[1]
+; CHECK-CVT-NEXT:    fcvtzs w0, s3
+; CHECK-CVT-NEXT:    csel w10, w10, w9, gt
 ; CHECK-CVT-NEXT:    cmp w14, #127
+; CHECK-CVT-NEXT:    fcvtzs w4, s0
 ; CHECK-CVT-NEXT:    csel w12, w14, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w14, s1
-; CHECK-CVT-NEXT:    cmn w12, #128
 ; CHECK-CVT-NEXT:    mov s1, v1.s[3]
+; CHECK-CVT-NEXT:    cmn w12, #128
 ; CHECK-CVT-NEXT:    csel w12, w12, w9, gt
 ; CHECK-CVT-NEXT:    cmp w15, #127
+; CHECK-CVT-NEXT:    fcvtzs w18, s2
 ; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w16, s3
+; CHECK-CVT-NEXT:    mov s2, v3.s[3]
 ; CHECK-CVT-NEXT:    cmn w15, #128
-; CHECK-CVT-NEXT:    mov s3, v2.s[1]
+; CHECK-CVT-NEXT:    fcvtzs w17, s1
+; CHECK-CVT-NEXT:    mov s1, v3.s[2]
 ; CHECK-CVT-NEXT:    csel w15, w15, w9, gt
 ; CHECK-CVT-NEXT:    cmp w14, #127
 ; CHECK-CVT-NEXT:    csel w14, w14, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w17, s1
 ; CHECK-CVT-NEXT:    cmn w14, #128
-; CHECK-CVT-NEXT:    mov s1, v2.s[2]
+; CHECK-CVT-NEXT:    fcvtzs w2, s2
+; CHECK-CVT-NEXT:    fmov s2, w13
 ; CHECK-CVT-NEXT:    csel w14, w14, w9, gt
 ; CHECK-CVT-NEXT:    cmp w16, #127
+; CHECK-CVT-NEXT:    fcvtzs w1, s1
 ; CHECK-CVT-NEXT:    csel w16, w16, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w18, s3
+; CHECK-CVT-NEXT:    mov s1, v0.s[1]
 ; CHECK-CVT-NEXT:    cmn w16, #128
-; CHECK-CVT-NEXT:    fcvtzs w0, s2
+; CHECK-CVT-NEXT:    mov v2.s[1], w11
 ; CHECK-CVT-NEXT:    csel w16, w16, w9, gt
 ; CHECK-CVT-NEXT:    cmp w17, #127
 ; CHECK-CVT-NEXT:    csel w17, w17, w8, lt
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-CVT-NEXT:    cmn w17, #128
-; CHECK-CVT-NEXT:    mov s2, v2.s[3]
+; CHECK-CVT-NEXT:    fcvtzs w3, s1
+; CHECK-CVT-NEXT:    mov s1, v0.s[2]
 ; CHECK-CVT-NEXT:    csel w17, w17, w9, gt
 ; CHECK-CVT-NEXT:    cmp w18, #127
+; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    csel w18, w18, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w1, s1
+; CHECK-CVT-NEXT:    mov v2.s[2], w10
 ; CHECK-CVT-NEXT:    cmn w18, #128
-; CHECK-CVT-NEXT:    mov s1, v0.s[1]
 ; CHECK-CVT-NEXT:    csel w18, w18, w9, gt
 ; CHECK-CVT-NEXT:    cmp w0, #127
 ; CHECK-CVT-NEXT:    csel w0, w0, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w2, s2
 ; CHECK-CVT-NEXT:    cmn w0, #128
-; CHECK-CVT-NEXT:    fcvtzs w4, s0
+; CHECK-CVT-NEXT:    mov v2.s[3], w12
 ; CHECK-CVT-NEXT:    csel w0, w0, w9, gt
 ; CHECK-CVT-NEXT:    cmp w1, #127
 ; CHECK-CVT-NEXT:    csel w1, w1, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w3, s1
+; CHECK-CVT-NEXT:    fmov s3, w0
 ; CHECK-CVT-NEXT:    cmn w1, #128
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
 ; CHECK-CVT-NEXT:    csel w1, w1, w9, gt
 ; CHECK-CVT-NEXT:    cmp w2, #127
 ; CHECK-CVT-NEXT:    csel w2, w2, w8, lt
-; CHECK-CVT-NEXT:    fmov s2, w11
+; CHECK-CVT-NEXT:    mov v3.s[1], w18
 ; CHECK-CVT-NEXT:    cmn w2, #128
-; CHECK-CVT-NEXT:    fmov s3, w14
 ; CHECK-CVT-NEXT:    csel w2, w2, w9, gt
 ; CHECK-CVT-NEXT:    cmp w3, #127
 ; CHECK-CVT-NEXT:    csel w3, w3, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w14, s1
 ; CHECK-CVT-NEXT:    cmn w3, #128
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w3, w3, w9, gt
+; CHECK-CVT-NEXT:    mov v3.s[2], w1
+; CHECK-CVT-NEXT:    csel w13, w3, w9, gt
 ; CHECK-CVT-NEXT:    cmp w4, #127
-; CHECK-CVT-NEXT:    csel w11, w4, w8, lt
-; CHECK-CVT-NEXT:    fmov s4, w0
-; CHECK-CVT-NEXT:    cmn w11, #128
-; CHECK-CVT-NEXT:    csel w11, w11, w9, gt
-; CHECK-CVT-NEXT:    cmp w14, #127
-; CHECK-CVT-NEXT:    mov v2.s[1], w10
-; CHECK-CVT-NEXT:    csel w10, w14, w8, lt
-; CHECK-CVT-NEXT:    mov v3.s[1], w15
-; CHECK-CVT-NEXT:    cmn w10, #128
-; CHECK-CVT-NEXT:    fmov s1, w11
-; CHECK-CVT-NEXT:    csel w10, w10, w9, gt
+; CHECK-CVT-NEXT:    csel w3, w4, w8, lt
+; CHECK-CVT-NEXT:    fcvtzs w4, s1
+; CHECK-CVT-NEXT:    fmov s1, w14
+; CHECK-CVT-NEXT:    cmn w3, #128
+; CHECK-CVT-NEXT:    csel w11, w3, w9, gt
+; CHECK-CVT-NEXT:    mov v3.s[3], w2
+; CHECK-CVT-NEXT:    fmov s4, w11
+; CHECK-CVT-NEXT:    mov v1.s[1], w15
 ; CHECK-CVT-NEXT:    fcvtzs w11, s0
-; CHECK-CVT-NEXT:    mov v4.s[1], w18
-; CHECK-CVT-NEXT:    mov v1.s[1], w3
+; CHECK-CVT-NEXT:    cmp w4, #127
+; CHECK-CVT-NEXT:    mov v4.s[1], w13
+; CHECK-CVT-NEXT:    csel w13, w4, w8, lt
+; CHECK-CVT-NEXT:    cmn w13, #128
+; CHECK-CVT-NEXT:    mov v1.s[2], w16
+; CHECK-CVT-NEXT:    csel w10, w13, w9, gt
 ; CHECK-CVT-NEXT:    cmp w11, #127
 ; CHECK-CVT-NEXT:    csel w8, w11, w8, lt
-; CHECK-CVT-NEXT:    mov v2.s[2], w13
+; CHECK-CVT-NEXT:    mov v4.s[2], w10
 ; CHECK-CVT-NEXT:    cmn w8, #128
-; CHECK-CVT-NEXT:    mov v3.s[2], w16
 ; CHECK-CVT-NEXT:    csel w8, w8, w9, gt
-; CHECK-CVT-NEXT:    mov v4.s[2], w1
-; CHECK-CVT-NEXT:    mov v1.s[2], w10
-; CHECK-CVT-NEXT:    mov v2.s[3], w12
-; CHECK-CVT-NEXT:    mov v3.s[3], w17
-; CHECK-CVT-NEXT:    mov v4.s[3], w2
-; CHECK-CVT-NEXT:    mov v1.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v3.8h, v2.8h
-; CHECK-CVT-NEXT:    uzp1 v1.8h, v1.8h, v4.8h
+; CHECK-CVT-NEXT:    mov v1.s[3], w17
+; CHECK-CVT-NEXT:    mov v4.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
+; CHECK-CVT-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
 ; CHECK-CVT-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -3161,119 +3161,119 @@ define <16 x i16> @test_signed_v16f16_v16i16(<16 x half> %f) {
 ; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
 ; CHECK-CVT-NEXT:    mov w8, #32767 // =0x7fff
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    mov w9, #-32768 // =0xffff8000
 ; CHECK-CVT-NEXT:    mov s3, v2.s[1]
-; CHECK-CVT-NEXT:    fcvtzs w11, s2
-; CHECK-CVT-NEXT:    fcvtzs w10, s3
+; CHECK-CVT-NEXT:    fcvtzs w10, s2
+; CHECK-CVT-NEXT:    fcvtzs w9, s3
 ; CHECK-CVT-NEXT:    mov s3, v2.s[2]
 ; CHECK-CVT-NEXT:    mov s2, v2.s[3]
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
+; CHECK-CVT-NEXT:    cmp w9, w8
 ; CHECK-CVT-NEXT:    fcvtzs w12, s3
-; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
 ; CHECK-CVT-NEXT:    mov s3, v0.s[1]
-; CHECK-CVT-NEXT:    csel w10, w10, w9, gt
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lt
+; CHECK-CVT-NEXT:    csel w11, w9, w8, lt
+; CHECK-CVT-NEXT:    mov w9, #-32768 // =0xffff8000
 ; CHECK-CVT-NEXT:    fcvtzs w14, s2
 ; CHECK-CVT-NEXT:    cmn w11, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    mov s2, v0.s[2]
 ; CHECK-CVT-NEXT:    csel w11, w11, w9, gt
-; CHECK-CVT-NEXT:    cmp w12, w8
-; CHECK-CVT-NEXT:    csel w12, w12, w8, lt
+; CHECK-CVT-NEXT:    cmp w10, w8
+; CHECK-CVT-NEXT:    csel w10, w10, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w15, s3
-; CHECK-CVT-NEXT:    cmn w12, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s3, v0.s[2]
-; CHECK-CVT-NEXT:    csel w13, w12, w9, gt
+; CHECK-CVT-NEXT:    fcvtl2 v3.4s, v1.8h
+; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    csel w13, w10, w9, gt
+; CHECK-CVT-NEXT:    cmp w12, w8
+; CHECK-CVT-NEXT:    fcvtzs w16, s2
+; CHECK-CVT-NEXT:    csel w10, w12, w8, lt
+; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-CVT-NEXT:    mov s2, v3.s[1]
+; CHECK-CVT-NEXT:    fcvtzs w0, s3
+; CHECK-CVT-NEXT:    csel w10, w10, w9, gt
 ; CHECK-CVT-NEXT:    cmp w14, w8
+; CHECK-CVT-NEXT:    fcvtzs w4, s1
 ; CHECK-CVT-NEXT:    csel w12, w14, w8, lt
 ; CHECK-CVT-NEXT:    fcvtzs w14, s0
-; CHECK-CVT-NEXT:    cmn w12, #8, lsl #12 // =32768
 ; CHECK-CVT-NEXT:    mov s0, v0.s[3]
+; CHECK-CVT-NEXT:    cmn w12, #8, lsl #12 // =32768
 ; CHECK-CVT-NEXT:    csel w12, w12, w9, gt
 ; CHECK-CVT-NEXT:    cmp w15, w8
+; CHECK-CVT-NEXT:    fcvtzs w18, s2
 ; CHECK-CVT-NEXT:    csel w15, w15, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w16, s3
+; CHECK-CVT-NEXT:    mov s2, v3.s[3]
 ; CHECK-CVT-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s3, v2.s[1]
+; CHECK-CVT-NEXT:    fcvtzs w17, s0
+; CHECK-CVT-NEXT:    mov s0, v3.s[2]
 ; CHECK-CVT-NEXT:    csel w15, w15, w9, gt
 ; CHECK-CVT-NEXT:    cmp w14, w8
 ; CHECK-CVT-NEXT:    csel w14, w14, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w17, s0
 ; CHECK-CVT-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtzs w2, s2
+; CHECK-CVT-NEXT:    fmov s2, w13
 ; CHECK-CVT-NEXT:    csel w14, w14, w9, gt
 ; CHECK-CVT-NEXT:    cmp w16, w8
+; CHECK-CVT-NEXT:    fcvtzs w1, s0
 ; CHECK-CVT-NEXT:    csel w16, w16, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w18, s3
+; CHECK-CVT-NEXT:    mov s0, v1.s[1]
 ; CHECK-CVT-NEXT:    cmn w16, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s1, v2.s[2]
+; CHECK-CVT-NEXT:    mov v2.s[1], w11
 ; CHECK-CVT-NEXT:    csel w16, w16, w9, gt
 ; CHECK-CVT-NEXT:    cmp w17, w8
 ; CHECK-CVT-NEXT:    csel w17, w17, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w0, s2
 ; CHECK-CVT-NEXT:    cmn w17, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s2, v2.s[3]
+; CHECK-CVT-NEXT:    fcvtzs w3, s0
+; CHECK-CVT-NEXT:    mov s0, v1.s[2]
 ; CHECK-CVT-NEXT:    csel w17, w17, w9, gt
 ; CHECK-CVT-NEXT:    cmp w18, w8
+; CHECK-CVT-NEXT:    mov v2.s[2], w10
 ; CHECK-CVT-NEXT:    csel w18, w18, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w1, s1
 ; CHECK-CVT-NEXT:    cmn w18, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s1, v0.s[1]
 ; CHECK-CVT-NEXT:    csel w18, w18, w9, gt
 ; CHECK-CVT-NEXT:    cmp w0, w8
 ; CHECK-CVT-NEXT:    csel w0, w0, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w2, s2
+; CHECK-CVT-NEXT:    mov v2.s[3], w12
 ; CHECK-CVT-NEXT:    cmn w0, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    fcvtzs w4, s0
 ; CHECK-CVT-NEXT:    csel w0, w0, w9, gt
 ; CHECK-CVT-NEXT:    cmp w1, w8
 ; CHECK-CVT-NEXT:    csel w1, w1, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w3, s1
+; CHECK-CVT-NEXT:    fmov s3, w0
 ; CHECK-CVT-NEXT:    cmn w1, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s1, v0.s[2]
 ; CHECK-CVT-NEXT:    csel w1, w1, w9, gt
 ; CHECK-CVT-NEXT:    cmp w2, w8
 ; CHECK-CVT-NEXT:    csel w2, w2, w8, lt
-; CHECK-CVT-NEXT:    fmov s2, w11
+; CHECK-CVT-NEXT:    mov v3.s[1], w18
 ; CHECK-CVT-NEXT:    cmn w2, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    fmov s3, w14
 ; CHECK-CVT-NEXT:    csel w2, w2, w9, gt
 ; CHECK-CVT-NEXT:    cmp w3, w8
 ; CHECK-CVT-NEXT:    csel w3, w3, w8, lt
-; CHECK-CVT-NEXT:    fcvtzs w14, s1
 ; CHECK-CVT-NEXT:    cmn w3, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    csel w3, w3, w9, gt
+; CHECK-CVT-NEXT:    mov v3.s[2], w1
+; CHECK-CVT-NEXT:    csel w13, w3, w9, gt
+; CHECK-CVT-NEXT:    cmp w4, w8
+; CHECK-CVT-NEXT:    csel w3, w4, w8, lt
+; CHECK-CVT-NEXT:    fcvtzs w4, s0
+; CHECK-CVT-NEXT:    mov s0, v1.s[3]
+; CHECK-CVT-NEXT:    cmn w3, #8, lsl #12 // =32768
+; CHECK-CVT-NEXT:    fmov s1, w14
+; CHECK-CVT-NEXT:    csel w11, w3, w9, gt
+; CHECK-CVT-NEXT:    mov v3.s[3], w2
+; CHECK-CVT-NEXT:    fmov s4, w11
+; CHECK-CVT-NEXT:    mov v1.s[1], w15
 ; CHECK-CVT-NEXT:    cmp w4, w8
-; CHECK-CVT-NEXT:    csel w11, w4, w8, lt
-; CHECK-CVT-NEXT:    fmov s4, w0
-; CHECK-CVT-NEXT:    cmn w11, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    csel w11, w11, w9, gt
-; CHECK-CVT-NEXT:    cmp w14, w8
-; CHECK-CVT-NEXT:    mov v2.s[1], w10
-; CHECK-CVT-NEXT:    csel w10, w14, w8, lt
-; CHECK-CVT-NEXT:    mov v3.s[1], w15
-; CHECK-CVT-NEXT:    cmn w10, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    fmov s1, w11
-; CHECK-CVT-NEXT:    csel w10, w10, w9, gt
 ; CHECK-CVT-NEXT:    fcvtzs w11, s0
-; CHECK-CVT-NEXT:    mov v4.s[1], w18
-; CHECK-CVT-NEXT:    mov v1.s[1], w3
+; CHECK-CVT-NEXT:    mov v4.s[1], w13
+; CHECK-CVT-NEXT:    csel w13, w4, w8, lt
+; CHECK-CVT-NEXT:    cmn w13, #8, lsl #12 // =32768
+; CHECK-CVT-NEXT:    csel w10, w13, w9, gt
+; CHECK-CVT-NEXT:    mov v1.s[2], w16
 ; CHECK-CVT-NEXT:    cmp w11, w8
 ; CHECK-CVT-NEXT:    csel w8, w11, w8, lt
-; CHECK-CVT-NEXT:    mov v2.s[2], w13
+; CHECK-CVT-NEXT:    mov v4.s[2], w10
 ; CHECK-CVT-NEXT:    cmn w8, #8, lsl #12 // =32768
-; CHECK-CVT-NEXT:    mov v3.s[2], w16
 ; CHECK-CVT-NEXT:    csel w8, w8, w9, gt
-; CHECK-CVT-NEXT:    mov v4.s[2], w1
-; CHECK-CVT-NEXT:    mov v1.s[2], w10
-; CHECK-CVT-NEXT:    mov v2.s[3], w12
-; CHECK-CVT-NEXT:    mov v3.s[3], w17
-; CHECK-CVT-NEXT:    mov v4.s[3], w2
-; CHECK-CVT-NEXT:    mov v1.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v3.8h, v2.8h
-; CHECK-CVT-NEXT:    uzp1 v1.8h, v1.8h, v4.8h
+; CHECK-CVT-NEXT:    mov v1.s[3], w17
+; CHECK-CVT-NEXT:    mov v4.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
+; CHECK-CVT-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_signed_v16f16_v16i16:
@@ -3289,62 +3289,62 @@ define <8 x i8> @test_signed_v8f64_v8i8(<8 x double> %f) {
 ; CHECK-LABEL: test_signed_v8f64_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d4, v3.d[1]
-; CHECK-NEXT:    mov w8, #127 // =0x7f
-; CHECK-NEXT:    fcvtzs w10, d3
+; CHECK-NEXT:    fcvtzs w9, d3
+; CHECK-NEXT:    mov w10, #127 // =0x7f
 ; CHECK-NEXT:    mov w11, #-128 // =0xffffff80
 ; CHECK-NEXT:    mov d3, v1.d[1]
 ; CHECK-NEXT:    fcvtzs w13, d2
 ; CHECK-NEXT:    fcvtzs w15, d1
-; CHECK-NEXT:    mov d1, v0.d[1]
-; CHECK-NEXT:    fcvtzs w9, d4
+; CHECK-NEXT:    fcvtzs w17, d0
+; CHECK-NEXT:    fcvtzs w8, d4
 ; CHECK-NEXT:    mov d4, v2.d[1]
+; CHECK-NEXT:    mov d2, v0.d[1]
 ; CHECK-NEXT:    fcvtzs w14, d3
-; CHECK-NEXT:    cmp w9, #127
-; CHECK-NEXT:    csel w9, w9, w8, lt
+; CHECK-NEXT:    cmp w8, #127
 ; CHECK-NEXT:    fcvtzs w12, d4
+; CHECK-NEXT:    fcvtzs w16, d2
+; CHECK-NEXT:    csel w8, w8, w10, lt
+; CHECK-NEXT:    cmn w8, #128
+; CHECK-NEXT:    csel w8, w8, w11, gt
+; CHECK-NEXT:    cmp w9, #127
+; CHECK-NEXT:    csel w9, w9, w10, lt
 ; CHECK-NEXT:    cmn w9, #128
 ; CHECK-NEXT:    csel w9, w9, w11, gt
-; CHECK-NEXT:    cmp w10, #127
-; CHECK-NEXT:    csel w10, w10, w8, lt
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    csel w10, w10, w11, gt
 ; CHECK-NEXT:    cmp w12, #127
-; CHECK-NEXT:    csel w12, w12, w8, lt
+; CHECK-NEXT:    csel w12, w12, w10, lt
+; CHECK-NEXT:    fmov s3, w9
 ; CHECK-NEXT:    cmn w12, #128
 ; CHECK-NEXT:    csel w12, w12, w11, gt
 ; CHECK-NEXT:    cmp w13, #127
-; CHECK-NEXT:    csel w13, w13, w8, lt
-; CHECK-NEXT:    fmov s5, w10
+; CHECK-NEXT:    csel w13, w13, w10, lt
+; CHECK-NEXT:    mov v3.s[1], w8
 ; CHECK-NEXT:    cmn w13, #128
 ; CHECK-NEXT:    csel w13, w13, w11, gt
 ; CHECK-NEXT:    cmp w14, #127
-; CHECK-NEXT:    csel w14, w14, w8, lt
+; CHECK-NEXT:    csel w14, w14, w10, lt
+; CHECK-NEXT:    fmov s2, w13
 ; CHECK-NEXT:    cmn w14, #128
-; CHECK-NEXT:    csel w10, w14, w11, gt
+; CHECK-NEXT:    csel w14, w14, w11, gt
 ; CHECK-NEXT:    cmp w15, #127
-; CHECK-NEXT:    fcvtzs w14, d1
-; CHECK-NEXT:    csel w15, w15, w8, lt
+; CHECK-NEXT:    csel w15, w15, w10, lt
+; CHECK-NEXT:    mov v2.s[1], w12
 ; CHECK-NEXT:    cmn w15, #128
-; CHECK-NEXT:    mov v5.s[1], w9
-; CHECK-NEXT:    csel w9, w15, w11, gt
-; CHECK-NEXT:    cmp w14, #127
-; CHECK-NEXT:    fcvtzs w15, d0
-; CHECK-NEXT:    fmov s4, w13
-; CHECK-NEXT:    csel w13, w14, w8, lt
-; CHECK-NEXT:    cmn w13, #128
-; CHECK-NEXT:    csel w13, w13, w11, gt
-; CHECK-NEXT:    cmp w15, #127
-; CHECK-NEXT:    mov v4.s[1], w12
-; CHECK-NEXT:    csel w8, w15, w8, lt
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    cmn w8, #128
-; CHECK-NEXT:    csel w8, w8, w11, gt
-; CHECK-NEXT:    mov v3.s[1], w10
-; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    csel w15, w15, w11, gt
+; CHECK-NEXT:    cmp w16, #127
+; CHECK-NEXT:    csel w9, w16, w10, lt
+; CHECK-NEXT:    fmov s1, w15
+; CHECK-NEXT:    cmn w9, #128
+; CHECK-NEXT:    csel w8, w9, w11, gt
+; CHECK-NEXT:    cmp w17, #127
+; CHECK-NEXT:    csel w9, w17, w10, lt
+; CHECK-NEXT:    mov v1.s[1], w14
+; CHECK-NEXT:    cmn w9, #128
+; CHECK-NEXT:    csel w9, w9, w11, gt
+; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    mov v0.s[1], w8
 ; CHECK-NEXT:    adrp x8, .LCPI82_0
-; CHECK-NEXT:    mov v2.s[1], w13
-; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI82_0]
-; CHECK-NEXT:    tbl v0.8b, { v2.16b, v3.16b, v4.16b, v5.16b }, v0.8b
+; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI82_0]
+; CHECK-NEXT:    tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
 ; CHECK-NEXT:    ret
     %x = call <8 x i8> @llvm.fptosi.sat.v8f64.v8i8(<8 x double> %f)
     ret <8 x i8> %x
@@ -3354,130 +3354,130 @@ define <16 x i8> @test_signed_v16f64_v16i8(<16 x double> %f) {
 ; CHECK-LABEL: test_signed_v16f64_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d16, v0.d[1]
+; CHECK-NEXT:    fcvtzs w10, d0
 ; CHECK-NEXT:    mov w8, #127 // =0x7f
-; CHECK-NEXT:    fcvtzs w11, d0
-; CHECK-NEXT:    mov w9, #-128 // =0xffffff80
-; CHECK-NEXT:    mov d0, v2.d[1]
+; CHECK-NEXT:    mov d0, v1.d[1]
 ; CHECK-NEXT:    fcvtzs w13, d1
-; CHECK-NEXT:    fcvtzs w10, d16
-; CHECK-NEXT:    mov d16, v1.d[1]
-; CHECK-NEXT:    fcvtzs w14, d0
+; CHECK-NEXT:    mov d1, v2.d[1]
+; CHECK-NEXT:    fcvtzs w9, d16
+; CHECK-NEXT:    fcvtzs w12, d0
+; CHECK-NEXT:    cmp w9, #127
+; CHECK-NEXT:    csel w11, w9, w8, lt
+; CHECK-NEXT:    mov w9, #-128 // =0xffffff80
+; CHECK-NEXT:    cmn w11, #128
+; CHECK-NEXT:    csel w11, w11, w9, gt
 ; CHECK-NEXT:    cmp w10, #127
 ; CHECK-NEXT:    csel w10, w10, w8, lt
-; CHECK-NEXT:    fcvtzs w12, d16
 ; CHECK-NEXT:    cmn w10, #128
 ; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    csel w11, w11, w8, lt
-; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    csel w11, w11, w9, gt
 ; CHECK-NEXT:    cmp w12, #127
-; CHECK-NEXT:    csel w12, w12, w8, lt
+; CHECK-NEXT:    fmov s0, w10
+; CHECK-NEXT:    csel w10, w12, w8, lt
+; CHECK-NEXT:    cmn w10, #128
+; CHECK-NEXT:    csel w10, w10, w9, gt
+; CHECK-NEXT:    cmp w13, #127
+; CHECK-NEXT:    csel w12, w13, w8, lt
+; CHECK-NEXT:    mov v0.s[1], w11
+; CHECK-NEXT:    fcvtzs w11, d1
 ; CHECK-NEXT:    cmn w12, #128
 ; CHECK-NEXT:    csel w12, w12, w9, gt
-; CHECK-NEXT:    cmp w13, #127
-; CHECK-NEXT:    fmov s0, w11
-; CHECK-NEXT:    csel w11, w13, w8, lt
-; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    fcvtzs w13, d2
-; CHECK-NEXT:    csel w11, w11, w9, gt
-; CHECK-NEXT:    cmp w14, #127
-; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    csel w14, w14, w8, lt
-; CHECK-NEXT:    cmn w14, #128
-; CHECK-NEXT:    mov d2, v3.d[1]
-; CHECK-NEXT:    fmov s1, w11
-; CHECK-NEXT:    csel w11, w14, w9, gt
-; CHECK-NEXT:    cmp w13, #127
-; CHECK-NEXT:    fcvtzs w10, d3
-; CHECK-NEXT:    mov w14, v0.s[1]
-; CHECK-NEXT:    csel w13, w13, w8, lt
-; CHECK-NEXT:    cmn w13, #128
-; CHECK-NEXT:    mov d3, v4.d[1]
-; CHECK-NEXT:    csel w13, w13, w9, gt
-; CHECK-NEXT:    mov v1.s[1], w12
+; CHECK-NEXT:    fmov s1, w12
 ; CHECK-NEXT:    fcvtzs w12, d2
-; CHECK-NEXT:    mov v0.b[1], w14
-; CHECK-NEXT:    fmov s2, w13
+; CHECK-NEXT:    mov d2, v3.d[1]
+; CHECK-NEXT:    cmp w11, #127
+; CHECK-NEXT:    mov w13, v0.s[1]
+; CHECK-NEXT:    mov v1.s[1], w10
+; CHECK-NEXT:    csel w10, w11, w8, lt
+; CHECK-NEXT:    cmn w10, #128
+; CHECK-NEXT:    fcvtzs w11, d2
+; CHECK-NEXT:    csel w10, w10, w9, gt
 ; CHECK-NEXT:    cmp w12, #127
-; CHECK-NEXT:    fcvtzs w13, d3
+; CHECK-NEXT:    mov v0.b[1], w13
 ; CHECK-NEXT:    csel w12, w12, w8, lt
-; CHECK-NEXT:    fcvtzs w14, d4
 ; CHECK-NEXT:    cmn w12, #128
-; CHECK-NEXT:    mov d3, v5.d[1]
-; CHECK-NEXT:    mov v2.s[1], w11
-; CHECK-NEXT:    mov w11, v1.s[1]
-; CHECK-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-NEXT:    mov w13, v1.s[1]
 ; CHECK-NEXT:    csel w12, w12, w9, gt
-; CHECK-NEXT:    cmp w10, #127
-; CHECK-NEXT:    mov d4, v6.d[1]
-; CHECK-NEXT:    csel w10, w10, w8, lt
+; CHECK-NEXT:    cmp w11, #127
+; CHECK-NEXT:    fmov s2, w12
+; CHECK-NEXT:    fcvtzs w12, d3
+; CHECK-NEXT:    mov d3, v4.d[1]
+; CHECK-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-NEXT:    mov v2.s[1], w10
+; CHECK-NEXT:    csel w10, w11, w8, lt
 ; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    cmp w13, #127
-; CHECK-NEXT:    mov v0.b[3], w11
-; CHECK-NEXT:    csel w13, w13, w8, lt
-; CHECK-NEXT:    cmn w13, #128
 ; CHECK-NEXT:    fcvtzs w11, d3
-; CHECK-NEXT:    csel w13, w13, w9, gt
-; CHECK-NEXT:    cmp w14, #127
-; CHECK-NEXT:    fmov s3, w10
-; CHECK-NEXT:    csel w10, w14, w8, lt
-; CHECK-NEXT:    mov w14, v2.s[1]
-; CHECK-NEXT:    cmn w10, #128
-; CHECK-NEXT:    mov v0.b[4], v2.b[0]
 ; CHECK-NEXT:    csel w10, w10, w9, gt
-; CHECK-NEXT:    mov v3.s[1], w12
+; CHECK-NEXT:    cmp w12, #127
+; CHECK-NEXT:    mov v0.b[3], w13
+; CHECK-NEXT:    csel w12, w12, w8, lt
+; CHECK-NEXT:    cmn w12, #128
+; CHECK-NEXT:    mov w13, v2.s[1]
+; CHECK-NEXT:    csel w12, w12, w9, gt
 ; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    csel w11, w11, w8, lt
-; CHECK-NEXT:    fcvtzs w12, d5
-; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    mov v0.b[5], w14
-; CHECK-NEXT:    fcvtzs w14, d4
-; CHECK-NEXT:    fmov s4, w10
-; CHECK-NEXT:    csel w10, w11, w9, gt
-; CHECK-NEXT:    mov w11, v3.s[1]
+; CHECK-NEXT:    fmov s3, w12
+; CHECK-NEXT:    fcvtzs w12, d4
+; CHECK-NEXT:    mov v0.b[4], v2.b[0]
+; CHECK-NEXT:    mov d4, v5.d[1]
+; CHECK-NEXT:    mov v3.s[1], w10
+; CHECK-NEXT:    csel w10, w11, w8, lt
+; CHECK-NEXT:    cmn w10, #128
+; CHECK-NEXT:    mov v0.b[5], w13
+; CHECK-NEXT:    csel w10, w10, w9, gt
 ; CHECK-NEXT:    cmp w12, #127
+; CHECK-NEXT:    fcvtzs w11, d4
 ; CHECK-NEXT:    csel w12, w12, w8, lt
-; CHECK-NEXT:    mov v0.b[6], v3.b[0]
 ; CHECK-NEXT:    cmn w12, #128
-; CHECK-NEXT:    mov v4.s[1], w13
+; CHECK-NEXT:    mov w13, v3.s[1]
 ; CHECK-NEXT:    csel w12, w12, w9, gt
-; CHECK-NEXT:    cmp w14, #127
-; CHECK-NEXT:    csel w13, w14, w8, lt
-; CHECK-NEXT:    mov v0.b[7], w11
-; CHECK-NEXT:    fcvtzs w11, d6
-; CHECK-NEXT:    cmn w13, #128
-; CHECK-NEXT:    fmov s5, w12
-; CHECK-NEXT:    csel w12, w13, w9, gt
-; CHECK-NEXT:    mov w13, v4.s[1]
+; CHECK-NEXT:    mov v0.b[6], v3.b[0]
+; CHECK-NEXT:    fmov s4, w12
+; CHECK-NEXT:    fcvtzs w12, d5
 ; CHECK-NEXT:    cmp w11, #127
-; CHECK-NEXT:    mov d6, v7.d[1]
-; CHECK-NEXT:    mov v0.b[8], v4.b[0]
-; CHECK-NEXT:    csel w11, w11, w8, lt
+; CHECK-NEXT:    mov d5, v6.d[1]
+; CHECK-NEXT:    mov v4.s[1], w10
+; CHECK-NEXT:    csel w10, w11, w8, lt
+; CHECK-NEXT:    mov v0.b[7], w13
+; CHECK-NEXT:    cmn w10, #128
+; CHECK-NEXT:    csel w10, w10, w9, gt
+; CHECK-NEXT:    cmp w12, #127
+; CHECK-NEXT:    fcvtzs w13, d5
+; CHECK-NEXT:    csel w11, w12, w8, lt
 ; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    mov v5.s[1], w10
-; CHECK-NEXT:    csel w10, w11, w9, gt
+; CHECK-NEXT:    mov w12, v4.s[1]
+; CHECK-NEXT:    mov v0.b[8], v4.b[0]
+; CHECK-NEXT:    csel w11, w11, w9, gt
+; CHECK-NEXT:    fmov s5, w11
 ; CHECK-NEXT:    fcvtzs w11, d6
-; CHECK-NEXT:    mov v0.b[9], w13
-; CHECK-NEXT:    fcvtzs w13, d7
-; CHECK-NEXT:    fmov s6, w10
-; CHECK-NEXT:    mov w10, v5.s[1]
+; CHECK-NEXT:    cmp w13, #127
+; CHECK-NEXT:    mov d6, v7.d[1]
+; CHECK-NEXT:    mov v0.b[9], w12
+; CHECK-NEXT:    mov v5.s[1], w10
+; CHECK-NEXT:    csel w10, w13, w8, lt
+; CHECK-NEXT:    cmn w10, #128
+; CHECK-NEXT:    csel w10, w10, w9, gt
 ; CHECK-NEXT:    cmp w11, #127
+; CHECK-NEXT:    fcvtzs w13, d6
 ; CHECK-NEXT:    csel w11, w11, w8, lt
-; CHECK-NEXT:    mov v0.b[10], v5.b[0]
 ; CHECK-NEXT:    cmn w11, #128
-; CHECK-NEXT:    mov v6.s[1], w12
-; CHECK-NEXT:    mov v0.b[11], w10
-; CHECK-NEXT:    csel w10, w11, w9, gt
+; CHECK-NEXT:    mov v0.b[10], v5.b[0]
+; CHECK-NEXT:    mov w12, v5.s[1]
+; CHECK-NEXT:    csel w11, w11, w9, gt
+; CHECK-NEXT:    fmov s6, w11
+; CHECK-NEXT:    fcvtzs w11, d7
 ; CHECK-NEXT:    cmp w13, #127
-; CHECK-NEXT:    csel w8, w13, w8, lt
+; CHECK-NEXT:    mov v0.b[11], w12
+; CHECK-NEXT:    mov v6.s[1], w10
+; CHECK-NEXT:    csel w10, w13, w8, lt
+; CHECK-NEXT:    cmn w10, #128
+; CHECK-NEXT:    csel w10, w10, w9, gt
+; CHECK-NEXT:    cmp w11, #127
+; CHECK-NEXT:    csel w8, w11, w8, lt
 ; CHECK-NEXT:    cmn w8, #128
-; CHECK-NEXT:    csel w8, w8, w9, gt
-; CHECK-NEXT:    mov w9, v6.s[1]
 ; CHECK-NEXT:    mov v0.b[12], v6.b[0]
+; CHECK-NEXT:    mov w11, v6.s[1]
+; CHECK-NEXT:    csel w8, w8, w9, gt
 ; CHECK-NEXT:    fmov s7, w8
-; CHECK-NEXT:    mov v0.b[13], w9
+; CHECK-NEXT:    mov v0.b[13], w11
 ; CHECK-NEXT:    mov v7.s[1], w10
 ; CHECK-NEXT:    mov v0.b[14], v7.b[0]
 ; CHECK-NEXT:    mov w8, v7.s[1]
@@ -3491,62 +3491,62 @@ define <8 x i16> @test_signed_v8f64_v8i16(<8 x double> %f) {
 ; CHECK-LABEL: test_signed_v8f64_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d4, v3.d[1]
-; CHECK-NEXT:    mov w8, #32767 // =0x7fff
+; CHECK-NEXT:    mov w9, #32767 // =0x7fff
 ; CHECK-NEXT:    fcvtzs w10, d3
 ; CHECK-NEXT:    mov w11, #-32768 // =0xffff8000
 ; CHECK-NEXT:    mov d3, v1.d[1]
 ; CHECK-NEXT:    fcvtzs w13, d2
 ; CHECK-NEXT:    fcvtzs w15, d1
-; CHECK-NEXT:    mov d1, v0.d[1]
-; CHECK-NEXT:    fcvtzs w9, d4
+; CHECK-NEXT:    fcvtzs w17, d0
+; CHECK-NEXT:    fcvtzs w8, d4
 ; CHECK-NEXT:    mov d4, v2.d[1]
+; CHECK-NEXT:    mov d2, v0.d[1]
 ; CHECK-NEXT:    fcvtzs w14, d3
-; CHECK-NEXT:    cmp w9, w8
-; CHECK-NEXT:    csel w9, w9, w8, lt
+; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    fcvtzs w12, d4
-; CHECK-NEXT:    cmn w9, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w9, w9, w11, gt
-; CHECK-NEXT:    cmp w10, w8
-; CHECK-NEXT:    csel w10, w10, w8, lt
+; CHECK-NEXT:    fcvtzs w16, d2
+; CHECK-NEXT:    csel w8, w8, w9, lt
+; CHECK-NEXT:    cmn w8, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w8, w8, w11, gt
+; CHECK-NEXT:    cmp w10, w9
+; CHECK-NEXT:    csel w10, w10, w9, lt
 ; CHECK-NEXT:    cmn w10, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w10, w10, w11, gt
-; CHECK-NEXT:    cmp w12, w8
-; CHECK-NEXT:    csel w12, w12, w8, lt
+; CHECK-NEXT:    cmp w12, w9
+; CHECK-NEXT:    csel w12, w12, w9, lt
+; CHECK-NEXT:    fmov s3, w10
 ; CHECK-NEXT:    cmn w12, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w12, w12, w11, gt
-; CHECK-NEXT:    cmp w13, w8
-; CHECK-NEXT:    csel w13, w13, w8, lt
-; CHECK-NEXT:    fmov s5, w10
+; CHECK-NEXT:    cmp w13, w9
+; CHECK-NEXT:    csel w13, w13, w9, lt
+; CHECK-NEXT:    mov v3.s[1], w8
 ; CHECK-NEXT:    cmn w13, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w13, w13, w11, gt
-; CHECK-NEXT:    cmp w14, w8
-; CHECK-NEXT:    csel w14, w14, w8, lt
+; CHECK-NEXT:    cmp w14, w9
+; CHECK-NEXT:    csel w14, w14, w9, lt
+; CHECK-NEXT:    fmov s2, w13
 ; CHECK-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w10, w14, w11, gt
-; CHECK-NEXT:    cmp w15, w8
-; CHECK-NEXT:    fcvtzs w14, d1
-; CHECK-NEXT:    csel w15, w15, w8, lt
+; CHECK-NEXT:    csel w14, w14, w11, gt
+; CHECK-NEXT:    cmp w15, w9
+; CHECK-NEXT:    csel w15, w15, w9, lt
+; CHECK-NEXT:    mov v2.s[1], w12
 ; CHECK-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-NEXT:    mov v5.s[1], w9
-; CHECK-NEXT:    csel w9, w15, w11, gt
-; CHECK-NEXT:    cmp w14, w8
-; CHECK-NEXT:    fcvtzs w15, d0
-; CHECK-NEXT:    fmov s4, w13
-; CHECK-NEXT:    csel w13, w14, w8, lt
-; CHECK-NEXT:    cmn w13, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w13, w13, w11, gt
-; CHECK-NEXT:    cmp w15, w8
-; CHECK-NEXT:    mov v4.s[1], w12
-; CHECK-NEXT:    csel w8, w15, w8, lt
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    cmn w8, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w8, w8, w11, gt
-; CHECK-NEXT:    mov v3.s[1], w10
-; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    csel w15, w15, w11, gt
+; CHECK-NEXT:    cmp w16, w9
+; CHECK-NEXT:    csel w10, w16, w9, lt
+; CHECK-NEXT:    fmov s1, w15
+; CHECK-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w8, w10, w11, gt
+; CHECK-NEXT:    cmp w17, w9
+; CHECK-NEXT:    csel w9, w17, w9, lt
+; CHECK-NEXT:    mov v1.s[1], w14
+; CHECK-NEXT:    cmn w9, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w9, w9, w11, gt
+; CHECK-NEXT:    fmov s0, w9
+; CHECK-NEXT:    mov v0.s[1], w8
 ; CHECK-NEXT:    adrp x8, .LCPI84_0
-; CHECK-NEXT:    mov v2.s[1], w13
-; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI84_0]
-; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b, v4.16b, v5.16b }, v0.16b
+; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI84_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
 ; CHECK-NEXT:    ret
     %x = call <8 x i16> @llvm.fptosi.sat.v8f64.v8i16(<8 x double> %f)
     ret <8 x i16> %x
@@ -3558,44 +3558,48 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) {
 ; CHECK-NEXT:    mov d16, v3.d[1]
 ; CHECK-NEXT:    mov w9, #32767 // =0x7fff
 ; CHECK-NEXT:    fcvtzs w11, d3
-; CHECK-NEXT:    mov w8, #-32768 // =0xffff8000
 ; CHECK-NEXT:    mov d3, v1.d[1]
 ; CHECK-NEXT:    fcvtzs w14, d2
 ; CHECK-NEXT:    fcvtzs w15, d1
 ; CHECK-NEXT:    mov d1, v7.d[1]
-; CHECK-NEXT:    fcvtzs w10, d16
-; CHECK-NEXT:    mov d16, v2.d[1]
-; CHECK-NEXT:    mov d2, v0.d[1]
 ; CHECK-NEXT:    fcvtzs w18, d0
-; CHECK-NEXT:    mov d0, v6.d[1]
 ; CHECK-NEXT:    fcvtzs w0, d7
-; CHECK-NEXT:    cmp w10, w9
 ; CHECK-NEXT:    fcvtzs w2, d6
-; CHECK-NEXT:    csel w10, w10, w9, lt
-; CHECK-NEXT:    fcvtzs w12, d16
-; CHECK-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-NEXT:    fcvtzs w4, d5
+; CHECK-NEXT:    fcvtzs w6, d4
+; CHECK-NEXT:    fcvtzs w8, d16
+; CHECK-NEXT:    mov d16, v2.d[1]
+; CHECK-NEXT:    mov d2, v0.d[1]
+; CHECK-NEXT:    mov d0, v6.d[1]
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    fcvtzs w13, d16
 ; CHECK-NEXT:    fcvtzs w17, d2
+; CHECK-NEXT:    csel w10, w8, w9, lt
+; CHECK-NEXT:    mov w8, #-32768 // =0xffff8000
+; CHECK-NEXT:    fcvtzs w1, d0
+; CHECK-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-NEXT:    mov d0, v5.d[1]
 ; CHECK-NEXT:    csel w10, w10, w8, gt
 ; CHECK-NEXT:    cmp w11, w9
 ; CHECK-NEXT:    csel w11, w11, w9, lt
-; CHECK-NEXT:    fcvtzs w1, d0
 ; CHECK-NEXT:    cmn w11, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w12, w11, w8, gt
+; CHECK-NEXT:    cmp w13, w9
+; CHECK-NEXT:    fcvtzs w3, d0
+; CHECK-NEXT:    csel w11, w13, w9, lt
+; CHECK-NEXT:    fcvtzs w13, d3
 ; CHECK-NEXT:    mov d0, v4.d[1]
-; CHECK-NEXT:    csel w13, w11, w8, gt
-; CHECK-NEXT:    cmp w12, w9
-; CHECK-NEXT:    csel w11, w12, w9, lt
-; CHECK-NEXT:    fcvtzs w12, d3
 ; CHECK-NEXT:    cmn w11, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w11, w11, w8, gt
 ; CHECK-NEXT:    cmp w14, w9
 ; CHECK-NEXT:    csel w14, w14, w9, lt
-; CHECK-NEXT:    fmov s19, w13
 ; CHECK-NEXT:    cmn w14, #8, lsl #12 // =32768
+; CHECK-NEXT:    fcvtzs w5, d0
 ; CHECK-NEXT:    csel w14, w14, w8, gt
-; CHECK-NEXT:    cmp w12, w9
-; CHECK-NEXT:    csel w12, w12, w9, lt
-; CHECK-NEXT:    cmn w12, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w12, w12, w8, gt
+; CHECK-NEXT:    cmp w13, w9
+; CHECK-NEXT:    csel w13, w13, w9, lt
+; CHECK-NEXT:    cmn w13, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w13, w13, w8, gt
 ; CHECK-NEXT:    cmp w15, w9
 ; CHECK-NEXT:    csel w15, w15, w9, lt
 ; CHECK-NEXT:    cmn w15, #8, lsl #12 // =32768
@@ -3603,68 +3607,64 @@ define <16 x i16> @test_signed_v16f64_v16i16(<16 x double> %f) {
 ; CHECK-NEXT:    cmp w17, w9
 ; CHECK-NEXT:    csel w15, w17, w9, lt
 ; CHECK-NEXT:    fcvtzs w17, d1
+; CHECK-NEXT:    fmov s3, w12
 ; CHECK-NEXT:    cmn w15, #8, lsl #12 // =32768
-; CHECK-NEXT:    mov d1, v5.d[1]
 ; CHECK-NEXT:    csel w15, w15, w8, gt
 ; CHECK-NEXT:    cmp w18, w9
 ; CHECK-NEXT:    csel w18, w18, w9, lt
+; CHECK-NEXT:    mov v3.s[1], w10
 ; CHECK-NEXT:    cmn w18, #8, lsl #12 // =32768
+; CHECK-NEXT:    fmov s2, w14
 ; CHECK-NEXT:    csel w18, w18, w8, gt
 ; CHECK-NEXT:    cmp w17, w9
 ; CHECK-NEXT:    csel w17, w17, w9, lt
 ; CHECK-NEXT:    cmn w17, #8, lsl #12 // =32768
+; CHECK-NEXT:    mov v2.s[1], w11
 ; CHECK-NEXT:    csel w17, w17, w8, gt
 ; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    fmov s1, w16
 ; CHECK-NEXT:    csel w0, w0, w9, lt
 ; CHECK-NEXT:    cmn w0, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w13, w0, w8, gt
+; CHECK-NEXT:    csel w0, w0, w8, gt
 ; CHECK-NEXT:    cmp w1, w9
+; CHECK-NEXT:    mov v1.s[1], w13
 ; CHECK-NEXT:    csel w1, w1, w9, lt
-; CHECK-NEXT:    fcvtzs w0, d1
+; CHECK-NEXT:    fmov s7, w0
+; CHECK-NEXT:    fmov s0, w18
 ; CHECK-NEXT:    cmn w1, #8, lsl #12 // =32768
-; CHECK-NEXT:    mov v19.s[1], w10
-; CHECK-NEXT:    csel w10, w1, w8, gt
+; CHECK-NEXT:    csel w1, w1, w8, gt
 ; CHECK-NEXT:    cmp w2, w9
-; CHECK-NEXT:    fcvtzs w1, d5
 ; CHECK-NEXT:    csel w2, w2, w9, lt
-; CHECK-NEXT:    fmov s18, w14
+; CHECK-NEXT:    mov v7.s[1], w17
+; CHECK-NEXT:    mov v0.s[1], w15
 ; CHECK-NEXT:    cmn w2, #8, lsl #12 // =32768
-; CHECK-NEXT:    fmov s23, w13
 ; CHECK-NEXT:    csel w2, w2, w8, gt
-; CHECK-NEXT:    cmp w0, w9
-; CHECK-NEXT:    csel w14, w0, w9, lt
-; CHECK-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w13, w14, w8, gt
-; CHECK-NEXT:    cmp w1, w9
-; CHECK-NEXT:    fcvtzs w14, d0
-; CHECK-NEXT:    csel w0, w1, w9, lt
-; CHECK-NEXT:    cmn w0, #8, lsl #12 // =32768
-; CHECK-NEXT:    mov v18.s[1], w11
-; CHECK-NEXT:    csel w11, w0, w8, gt
-; CHECK-NEXT:    mov v23.s[1], w17
-; CHECK-NEXT:    cmp w14, w9
-; CHECK-NEXT:    fcvtzs w17, d4
-; CHECK-NEXT:    csel w14, w14, w9, lt
-; CHECK-NEXT:    fmov s22, w2
-; CHECK-NEXT:    cmn w14, #8, lsl #12 // =32768
-; CHECK-NEXT:    csel w14, w14, w8, gt
-; CHECK-NEXT:    fmov s17, w16
-; CHECK-NEXT:    cmp w17, w9
-; CHECK-NEXT:    mov v22.s[1], w10
-; CHECK-NEXT:    csel w9, w17, w9, lt
-; CHECK-NEXT:    fmov s21, w11
+; CHECK-NEXT:    cmp w3, w9
+; CHECK-NEXT:    csel w3, w3, w9, lt
+; CHECK-NEXT:    fmov s6, w2
+; CHECK-NEXT:    cmn w3, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w3, w3, w8, gt
+; CHECK-NEXT:    cmp w4, w9
+; CHECK-NEXT:    csel w4, w4, w9, lt
+; CHECK-NEXT:    mov v6.s[1], w1
+; CHECK-NEXT:    cmn w4, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w12, w4, w8, gt
+; CHECK-NEXT:    cmp w5, w9
+; CHECK-NEXT:    csel w10, w5, w9, lt
+; CHECK-NEXT:    fmov s5, w12
+; CHECK-NEXT:    cmn w10, #8, lsl #12 // =32768
+; CHECK-NEXT:    csel w10, w10, w8, gt
+; CHECK-NEXT:    cmp w6, w9
+; CHECK-NEXT:    csel w9, w6, w9, lt
+; CHECK-NEXT:    mov v5.s[1], w3
 ; CHECK-NEXT:    cmn w9, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w8, w9, w8, gt
-; CHECK-NEXT:    adrp x9, .LCPI85_0
-; CHECK-NEXT:    mov v17.s[1], w12
-; CHECK-NEXT:    mov v21.s[1], w13
-; CHECK-NEXT:    fmov s16, w18
-; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI85_0]
-; CHECK-NEXT:    fmov s20, w8
-; CHECK-NEXT:    mov v16.s[1], w15
-; CHECK-NEXT:    mov v20.s[1], w14
-; CHECK-NEXT:    tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
-; CHECK-NEXT:    tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    adrp x8, .LCPI85_0
+; CHECK-NEXT:    ldr q16, [x8, :lo12:.LCPI85_0]
+; CHECK-NEXT:    mov v4.s[1], w10
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v16.16b
+; CHECK-NEXT:    tbl v1.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v16.16b
 ; CHECK-NEXT:    ret
     %x = call <16 x i16> @llvm.fptosi.sat.v16f64.v16i16(<16 x double> %f)
     ret <16 x i16> %x

diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
index e1148a51751f15..1e1e7327f71fdc 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
@@ -32,7 +32,7 @@ define i8 @test_unsigned_i8_f32(float %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i8_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu w9, s0
-; CHECK-NEXT:    mov w8, #255
+; CHECK-NEXT:    mov w8, #255 // =0xff
 ; CHECK-NEXT:    cmp w9, #255
 ; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
@@ -44,7 +44,7 @@ define i13 @test_unsigned_i13_f32(float %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i13_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu w8, s0
-; CHECK-NEXT:    mov w9, #8191
+; CHECK-NEXT:    mov w9, #8191 // =0x1fff
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    csel w0, w8, w9, lo
 ; CHECK-NEXT:    ret
@@ -56,7 +56,7 @@ define i16 @test_unsigned_i16_f32(float %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i16_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu w8, s0
-; CHECK-NEXT:    mov w9, #65535
+; CHECK-NEXT:    mov w9, #65535 // =0xffff
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    csel w0, w8, w9, lo
 ; CHECK-NEXT:    ret
@@ -68,7 +68,7 @@ define i19 @test_unsigned_i19_f32(float %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i19_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu w8, s0
-; CHECK-NEXT:    mov w9, #524287
+; CHECK-NEXT:    mov w9, #524287 // =0x7ffff
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    csel w0, w8, w9, lo
 ; CHECK-NEXT:    ret
@@ -89,7 +89,7 @@ define i50 @test_unsigned_i50_f32(float %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i50_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu x8, s0
-; CHECK-NEXT:    mov x9, #1125899906842623
+; CHECK-NEXT:    mov x9, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    csel x0, x8, x9, lo
 ; CHECK-NEXT:    ret
@@ -113,11 +113,11 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s8, s0
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    mov w8, #1904214015
+; CHECK-NEXT:    mov w8, #1904214015 // =0x717fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    mov x10, #68719476735
+; CHECK-NEXT:    mov x10, #68719476735 // =0xfffffffff
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s0
@@ -136,7 +136,7 @@ define i128 @test_unsigned_i128_f32(float %f) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s8, s0
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    mov w8, #2139095039
+; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    fmov s0, w8
@@ -181,7 +181,7 @@ define i8 @test_unsigned_i8_f64(double %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i8_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu w9, d0
-; CHECK-NEXT:    mov w8, #255
+; CHECK-NEXT:    mov w8, #255 // =0xff
 ; CHECK-NEXT:    cmp w9, #255
 ; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
@@ -193,7 +193,7 @@ define i13 @test_unsigned_i13_f64(double %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i13_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu w8, d0
-; CHECK-NEXT:    mov w9, #8191
+; CHECK-NEXT:    mov w9, #8191 // =0x1fff
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    csel w0, w8, w9, lo
 ; CHECK-NEXT:    ret
@@ -205,7 +205,7 @@ define i16 @test_unsigned_i16_f64(double %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i16_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu w8, d0
-; CHECK-NEXT:    mov w9, #65535
+; CHECK-NEXT:    mov w9, #65535 // =0xffff
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    csel w0, w8, w9, lo
 ; CHECK-NEXT:    ret
@@ -217,7 +217,7 @@ define i19 @test_unsigned_i19_f64(double %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i19_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu w8, d0
-; CHECK-NEXT:    mov w9, #524287
+; CHECK-NEXT:    mov w9, #524287 // =0x7ffff
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    csel w0, w8, w9, lo
 ; CHECK-NEXT:    ret
@@ -238,7 +238,7 @@ define i50 @test_unsigned_i50_f64(double %f) nounwind {
 ; CHECK-LABEL: test_unsigned_i50_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu x8, d0
-; CHECK-NEXT:    mov x9, #1125899906842623
+; CHECK-NEXT:    mov x9, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    csel x0, x8, x9, lo
 ; CHECK-NEXT:    ret
@@ -262,11 +262,11 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov d8, d0
 ; CHECK-NEXT:    bl __fixunsdfti
-; CHECK-NEXT:    mov x8, #5057542381537067007
+; CHECK-NEXT:    mov x8, #5057542381537067007 // =0x462fffffffffffff
 ; CHECK-NEXT:    fcmp d8, #0.0
-; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    mov x10, #68719476735
+; CHECK-NEXT:    mov x10, #68719476735 // =0xfffffffff
 ; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp d8, d0
@@ -285,7 +285,7 @@ define i128 @test_unsigned_i128_f64(double %f) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov d8, d0
 ; CHECK-NEXT:    bl __fixunsdfti
-; CHECK-NEXT:    mov x8, #5183643171103440895
+; CHECK-NEXT:    mov x8, #5183643171103440895 // =0x47efffffffffffff
 ; CHECK-NEXT:    fcmp d8, #0.0
 ; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
@@ -338,7 +338,7 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind {
 ; CHECK-CVT-LABEL: test_unsigned_i8_f16:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov w8, #255
+; CHECK-CVT-NEXT:    mov w8, #255 // =0xff
 ; CHECK-CVT-NEXT:    fcvtzu w9, s0
 ; CHECK-CVT-NEXT:    cmp w9, #255
 ; CHECK-CVT-NEXT:    csel w0, w9, w8, lo
@@ -347,7 +347,7 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind {
 ; CHECK-FP16-LABEL: test_unsigned_i8_f16:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    fcvtzu w9, h0
-; CHECK-FP16-NEXT:    mov w8, #255
+; CHECK-FP16-NEXT:    mov w8, #255 // =0xff
 ; CHECK-FP16-NEXT:    cmp w9, #255
 ; CHECK-FP16-NEXT:    csel w0, w9, w8, lo
 ; CHECK-FP16-NEXT:    ret
@@ -359,7 +359,7 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind {
 ; CHECK-CVT-LABEL: test_unsigned_i13_f16:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov w9, #8191
+; CHECK-CVT-NEXT:    mov w9, #8191 // =0x1fff
 ; CHECK-CVT-NEXT:    fcvtzu w8, s0
 ; CHECK-CVT-NEXT:    cmp w8, w9
 ; CHECK-CVT-NEXT:    csel w0, w8, w9, lo
@@ -368,7 +368,7 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind {
 ; CHECK-FP16-LABEL: test_unsigned_i13_f16:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    fcvtzu w8, h0
-; CHECK-FP16-NEXT:    mov w9, #8191
+; CHECK-FP16-NEXT:    mov w9, #8191 // =0x1fff
 ; CHECK-FP16-NEXT:    cmp w8, w9
 ; CHECK-FP16-NEXT:    csel w0, w8, w9, lo
 ; CHECK-FP16-NEXT:    ret
@@ -380,7 +380,7 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind {
 ; CHECK-CVT-LABEL: test_unsigned_i16_f16:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov w9, #65535
+; CHECK-CVT-NEXT:    mov w9, #65535 // =0xffff
 ; CHECK-CVT-NEXT:    fcvtzu w8, s0
 ; CHECK-CVT-NEXT:    cmp w8, w9
 ; CHECK-CVT-NEXT:    csel w0, w8, w9, lo
@@ -389,7 +389,7 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind {
 ; CHECK-FP16-LABEL: test_unsigned_i16_f16:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    fcvtzu w8, h0
-; CHECK-FP16-NEXT:    mov w9, #65535
+; CHECK-FP16-NEXT:    mov w9, #65535 // =0xffff
 ; CHECK-FP16-NEXT:    cmp w8, w9
 ; CHECK-FP16-NEXT:    csel w0, w8, w9, lo
 ; CHECK-FP16-NEXT:    ret
@@ -401,7 +401,7 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind {
 ; CHECK-CVT-LABEL: test_unsigned_i19_f16:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov w9, #524287
+; CHECK-CVT-NEXT:    mov w9, #524287 // =0x7ffff
 ; CHECK-CVT-NEXT:    fcvtzu w8, s0
 ; CHECK-CVT-NEXT:    cmp w8, w9
 ; CHECK-CVT-NEXT:    csel w0, w8, w9, lo
@@ -410,7 +410,7 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind {
 ; CHECK-FP16-LABEL: test_unsigned_i19_f16:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    fcvtzu w8, h0
-; CHECK-FP16-NEXT:    mov w9, #524287
+; CHECK-FP16-NEXT:    mov w9, #524287 // =0x7ffff
 ; CHECK-FP16-NEXT:    cmp w8, w9
 ; CHECK-FP16-NEXT:    csel w0, w8, w9, lo
 ; CHECK-FP16-NEXT:    ret
@@ -437,7 +437,7 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
 ; CHECK-CVT-LABEL: test_unsigned_i50_f16:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov x9, #1125899906842623
+; CHECK-CVT-NEXT:    mov x9, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-CVT-NEXT:    fcvtzu x8, s0
 ; CHECK-CVT-NEXT:    cmp x8, x9
 ; CHECK-CVT-NEXT:    csel x0, x8, x9, lo
@@ -446,7 +446,7 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind {
 ; CHECK-FP16-LABEL: test_unsigned_i50_f16:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    fcvtzu x8, h0
-; CHECK-FP16-NEXT:    mov x9, #1125899906842623
+; CHECK-FP16-NEXT:    mov x9, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-FP16-NEXT:    cmp x8, x9
 ; CHECK-FP16-NEXT:    csel x0, x8, x9, lo
 ; CHECK-FP16-NEXT:    ret
@@ -477,11 +477,11 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    mov w8, #1904214015
+; CHECK-NEXT:    mov w8, #1904214015 // =0x717fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    mov x10, #68719476735
+; CHECK-NEXT:    mov x10, #68719476735 // =0xfffffffff
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s0
@@ -501,7 +501,7 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
-; CHECK-NEXT:    mov w8, #2139095039
+; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    fmov s0, w8

diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 1dfd8a53149ee9..f23254cbf7b228 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -165,9 +165,9 @@ define <2 x i32> @test_unsigned_v2f64_v2i32(<2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    fcvtzu w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzu w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
     %x = call <2 x i32> @llvm.fptoui.sat.v2f64.v2i32(<2 x double> %f)
@@ -178,10 +178,10 @@ define <3 x i32> @test_unsigned_v3f64_v3i32(<3 x double> %f) {
 ; CHECK-LABEL: test_unsigned_v3f64_v3i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    fcvtzu w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzu w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
 ; CHECK-NEXT:    fcvtzu w8, d2
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    mov v0.s[2], w8
 ; CHECK-NEXT:    fcvtzu w8, d0
 ; CHECK-NEXT:    mov v0.s[3], w8
@@ -195,11 +195,11 @@ define <4 x i32> @test_unsigned_v4f64_v4i32(<4 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d2, v0.d[1]
 ; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    fcvtzu w9, d2
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzu w8, d2
-; CHECK-NEXT:    mov v0.s[1], w8
 ; CHECK-NEXT:    fcvtzu w8, d1
 ; CHECK-NEXT:    mov d1, v1.d[1]
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    mov v0.s[2], w8
 ; CHECK-NEXT:    fcvtzu w8, d1
 ; CHECK-NEXT:    mov v0.s[3], w8
@@ -262,8 +262,8 @@ define <1 x i32> @test_unsigned_v1f128_v1i32(<1 x fp128> %f) {
 ; CHECK-NEXT:    adrp x8, .LCPI14_1
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    cmp w19, #0
-; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI14_1]
+; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    csinv w8, w19, wzr, le
@@ -285,11 +285,11 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) {
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    adrp x8, .LCPI15_0
-; CHECK-NEXT:    stp q1, q0, [sp, #32] // 32-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    stp q1, q0, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -298,21 +298,21 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) {
 ; CHECK-NEXT:    adrp x8, .LCPI15_1
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    cmp w19, #0
-; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_1]
+; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    csinv w20, w19, wzr, le
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixunstfsi
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    cmp w0, #0
@@ -338,12 +338,12 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    stp q0, q2, [sp, #48] // 32-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v1.16b
+; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -352,8 +352,8 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-NEXT:    adrp x8, .LCPI16_1
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    cmp w19, #0
-; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_1]
+; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
@@ -364,8 +364,8 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixunstfsi
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    cmp w0, #0
@@ -380,14 +380,14 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixunstfsi
 ; CHECK-NEXT:    ldr q0, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv w8, w19, wzr, le
 ; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    csinv w8, w19, wzr, le
 ; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.s[2], w8
 ; CHECK-NEXT:    add sp, sp, #112
@@ -406,13 +406,13 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    stp q0, q2, [sp, #16] // 32-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v1.16b
+; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    str q3, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT:    str q3, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -421,21 +421,21 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-NEXT:    adrp x8, .LCPI17_1
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    cmp w19, #0
-; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_1]
+; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    str q1, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    csinv w20, w19, wzr, le
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixunstfsi
 ; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    cmp w0, #0
@@ -464,14 +464,14 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
 ; CHECK-NEXT:    mov w19, w0
 ; CHECK-NEXT:    bl __fixunstfsi
 ; CHECK-NEXT:    ldr q0, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    cmp w19, #0
 ; CHECK-NEXT:    csel w19, wzr, w0, lt
 ; CHECK-NEXT:    bl __gttf2
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv w8, w19, wzr, le
 ; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT:    csinv w8, w19, wzr, le
 ; CHECK-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.s[3], w8
 ; CHECK-NEXT:    add sp, sp, #128
@@ -567,10 +567,10 @@ define <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-NEXT:    fcvtzu v0.4s, v0.4s
 ; CHECK-NEXT:    mov w1, v1.s[1]
 ; CHECK-NEXT:    mov w2, v1.s[2]
-; CHECK-NEXT:    mov w3, v1.s[3]
 ; CHECK-NEXT:    mov w5, v0.s[1]
-; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    mov w3, v1.s[3]
 ; CHECK-NEXT:    fmov w4, s0
+; CHECK-NEXT:    fmov w0, s1
 ; CHECK-NEXT:    ret
     %x = call <6 x i32> @llvm.fptoui.sat.v6f16.v6i32(<6 x half> %f)
     ret <6 x i32> %x
@@ -710,9 +710,9 @@ define <2 x i64> @test_unsigned_v2f32_v2i64(<2 x float> %f) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov s1, v0.s[1]
 ; CHECK-NEXT:    fcvtzu x8, s0
+; CHECK-NEXT:    fcvtzu x9, s1
 ; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fcvtzu x8, s1
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    mov v0.d[1], x9
 ; CHECK-NEXT:    ret
     %x = call <2 x i64> @llvm.fptoui.sat.v2f32.v2i64(<2 x float> %f)
     ret <2 x i64> %x
@@ -740,9 +740,9 @@ define <2 x i100> @test_unsigned_v2f32_v2i100(<2 x float> %f) {
 ; CHECK-NEXT:    mov w8, #1904214015 // =0x717fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    mov x21, #68719476735 // =0xfffffffff
+; CHECK-NEXT:    fmov s9, w8
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    fmov s9, w8
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
@@ -760,8 +760,8 @@ define <2 x i100> @test_unsigned_v2f32_v2i100(<2 x float> %f) {
 ; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    csel x1, x21, x9, gt
-; CHECK-NEXT:    ldp x30, x21, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldp x30, x21, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #64
@@ -791,8 +791,8 @@ define <2 x i128> @test_unsigned_v2f32_v2i128(<2 x float> %f) {
 ; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    fmov s9, w8
+; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
@@ -802,8 +802,8 @@ define <2 x i128> @test_unsigned_v2f32_v2i128(<2 x float> %f) {
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x2, x19
 ; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    fcmp s0, #0.0
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
@@ -904,14 +904,14 @@ define <4 x i50> @test_unsigned_v4f32_v4i50(<4 x float> %f) {
 ; CHECK-LABEL: test_unsigned_v4f32_v4i50:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-NEXT:    mov s3, v0.s[1]
+; CHECK-NEXT:    mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-NEXT:    fcvtzu x11, s0
 ; CHECK-NEXT:    mov s2, v1.s[1]
 ; CHECK-NEXT:    fcvtzu x9, s1
 ; CHECK-NEXT:    fcvtzu x12, s3
-; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    fcvtzu x10, s2
+; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    csel x2, x9, x8, lo
 ; CHECK-NEXT:    cmp x10, x8
 ; CHECK-NEXT:    csel x3, x10, x8, lo
@@ -932,12 +932,12 @@ define <4 x i64> @test_unsigned_v4f32_v4i64(<4 x float> %f) {
 ; CHECK-NEXT:    fcvtzu x9, s0
 ; CHECK-NEXT:    mov s2, v1.s[1]
 ; CHECK-NEXT:    fcvtzu x8, s1
+; CHECK-NEXT:    fcvtzu x11, s3
 ; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    fcvtzu x9, s3
+; CHECK-NEXT:    fcvtzu x10, s2
 ; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    fcvtzu x8, s2
-; CHECK-NEXT:    mov v0.d[1], x9
-; CHECK-NEXT:    mov v1.d[1], x8
+; CHECK-NEXT:    mov v0.d[1], x11
+; CHECK-NEXT:    mov v1.d[1], x10
 ; CHECK-NEXT:    ret
     %x = call <4 x i64> @llvm.fptoui.sat.v4f32.v4i64(<4 x float> %f)
     ret <4 x i64> %x
@@ -968,10 +968,10 @@ define <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    mov w8, #1904214015 // =0x717fffff
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    mov x25, #68719476735 // =0xfffffffff
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov s9, w8
+; CHECK-NEXT:    mov x25, #68719476735 // =0xfffffffff
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
@@ -1009,17 +1009,17 @@ define <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT:    fcmp s0, #0.0
 ; CHECK-NEXT:    mov x7, x23
 ; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s0, s9
-; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    csel x1, x25, x9, gt
-; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ldp x30, x25, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    ret
@@ -1051,8 +1051,8 @@ define <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
-; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, #0.0
+; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov s9, w8
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
@@ -1090,17 +1090,17 @@ define <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT:    mov x6, x23
 ; CHECK-NEXT:    fcmp s0, #0.0
 ; CHECK-NEXT:    mov x7, x24
+; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s0, s9
-; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    csinv x1, x9, xzr, le
-; CHECK-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1222,9 +1222,9 @@ define <2 x i32> @test_unsigned_v2f64_v2i32_duplicate(<2 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d1, v0.d[1]
 ; CHECK-NEXT:    fcvtzu w8, d0
+; CHECK-NEXT:    fcvtzu w9, d1
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fcvtzu w8, d1
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
     %x = call <2 x i32> @llvm.fptoui.sat.v2f64.v2i32(<2 x double> %f)
@@ -1279,9 +1279,9 @@ define <2 x i100> @test_unsigned_v2f64_v2i100(<2 x double> %f) {
 ; CHECK-NEXT:    mov x8, #5057542381537067007 // =0x462fffffffffffff
 ; CHECK-NEXT:    fcmp d8, #0.0
 ; CHECK-NEXT:    mov x21, #68719476735 // =0xfffffffff
+; CHECK-NEXT:    fmov d9, x8
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    fmov d9, x8
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp d8, d9
@@ -1299,8 +1299,8 @@ define <2 x i100> @test_unsigned_v2f64_v2i100(<2 x double> %f) {
 ; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    csel x1, x21, x9, gt
-; CHECK-NEXT:    ldp x30, x21, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldp x30, x21, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #64
@@ -1329,8 +1329,8 @@ define <2 x i128> @test_unsigned_v2f64_v2i128(<2 x double> %f) {
 ; CHECK-NEXT:    mov x8, #5183643171103440895 // =0x47efffffffffffff
 ; CHECK-NEXT:    fcmp d8, #0.0
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    fmov d9, x8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    csel x9, xzr, x0, lt
 ; CHECK-NEXT:    fcmp d8, d9
@@ -1340,8 +1340,8 @@ define <2 x i128> @test_unsigned_v2f64_v2i128(<2 x double> %f) {
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov x2, x19
 ; CHECK-NEXT:    mov x3, x20
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    fcmp d0, #0.0
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
@@ -1476,17 +1476,17 @@ define <4 x i50> @test_unsigned_v4f16_v4i50(<4 x half> %f) {
 ; CHECK-CVT-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-CVT-NEXT:    mov h1, v0.h[1]
 ; CHECK-CVT-NEXT:    mov h2, v0.h[2]
+; CHECK-CVT-NEXT:    mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-CVT-NEXT:    mov h3, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-CVT-NEXT:    fcvt s1, h1
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
 ; CHECK-CVT-NEXT:    fcvtzu x9, s0
 ; CHECK-CVT-NEXT:    fcvtzu x10, s1
 ; CHECK-CVT-NEXT:    fcvtzu x11, s2
-; CHECK-CVT-NEXT:    cmp x9, x8
 ; CHECK-CVT-NEXT:    fcvtzu x12, s3
+; CHECK-CVT-NEXT:    cmp x9, x8
 ; CHECK-CVT-NEXT:    csel x0, x9, x8, lo
 ; CHECK-CVT-NEXT:    cmp x10, x8
 ; CHECK-CVT-NEXT:    csel x1, x10, x8, lo
@@ -1501,13 +1501,13 @@ define <4 x i50> @test_unsigned_v4f16_v4i50(<4 x half> %f) {
 ; CHECK-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-FP16-NEXT:    mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-FP16-NEXT:    mov h3, v0.h[3]
 ; CHECK-FP16-NEXT:    fcvtzu x9, h0
-; CHECK-FP16-NEXT:    mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-FP16-NEXT:    fcvtzu x10, h1
 ; CHECK-FP16-NEXT:    fcvtzu x11, h2
-; CHECK-FP16-NEXT:    cmp x9, x8
 ; CHECK-FP16-NEXT:    fcvtzu x12, h3
+; CHECK-FP16-NEXT:    cmp x9, x8
 ; CHECK-FP16-NEXT:    csel x0, x9, x8, lo
 ; CHECK-FP16-NEXT:    cmp x10, x8
 ; CHECK-FP16-NEXT:    csel x1, x10, x8, lo
@@ -1526,19 +1526,19 @@ define <4 x i64> @test_unsigned_v4f16_v4i64(<4 x half> %f) {
 ; CHECK-CVT-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-CVT-NEXT:    mov h1, v0.h[2]
 ; CHECK-CVT-NEXT:    mov h2, v0.h[1]
-; CHECK-CVT-NEXT:    fcvt s3, h0
-; CHECK-CVT-NEXT:    mov h0, v0.h[3]
+; CHECK-CVT-NEXT:    mov h3, v0.h[3]
+; CHECK-CVT-NEXT:    fcvt s0, h0
 ; CHECK-CVT-NEXT:    fcvt s1, h1
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvtzu x8, s3
-; CHECK-CVT-NEXT:    fcvt s3, h0
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvtzu x8, s0
 ; CHECK-CVT-NEXT:    fcvtzu x9, s1
+; CHECK-CVT-NEXT:    fcvtzu x10, s2
+; CHECK-CVT-NEXT:    fcvtzu x11, s3
 ; CHECK-CVT-NEXT:    fmov d0, x8
-; CHECK-CVT-NEXT:    fcvtzu x8, s2
 ; CHECK-CVT-NEXT:    fmov d1, x9
-; CHECK-CVT-NEXT:    fcvtzu x9, s3
-; CHECK-CVT-NEXT:    mov v0.d[1], x8
-; CHECK-CVT-NEXT:    mov v1.d[1], x9
+; CHECK-CVT-NEXT:    mov v0.d[1], x10
+; CHECK-CVT-NEXT:    mov v1.d[1], x11
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v4f16_v4i64:
@@ -1549,12 +1549,12 @@ define <4 x i64> @test_unsigned_v4f16_v4i64(<4 x half> %f) {
 ; CHECK-FP16-NEXT:    mov h3, v0.h[3]
 ; CHECK-FP16-NEXT:    fcvtzu x8, h0
 ; CHECK-FP16-NEXT:    fcvtzu x9, h1
+; CHECK-FP16-NEXT:    fcvtzu x10, h2
+; CHECK-FP16-NEXT:    fcvtzu x11, h3
 ; CHECK-FP16-NEXT:    fmov d0, x8
-; CHECK-FP16-NEXT:    fcvtzu x8, h2
 ; CHECK-FP16-NEXT:    fmov d1, x9
-; CHECK-FP16-NEXT:    fcvtzu x9, h3
-; CHECK-FP16-NEXT:    mov v0.d[1], x8
-; CHECK-FP16-NEXT:    mov v1.d[1], x9
+; CHECK-FP16-NEXT:    mov v0.d[1], x10
+; CHECK-FP16-NEXT:    mov v1.d[1], x11
 ; CHECK-FP16-NEXT:    ret
     %x = call <4 x i64> @llvm.fptoui.sat.v4f16.v4i64(<4 x half> %f)
     ret <4 x i64> %x
@@ -1589,15 +1589,15 @@ define <4 x i100> @test_unsigned_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #1904214015 // =0x717fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
+; CHECK-NEXT:    fmov s9, w8
 ; CHECK-NEXT:    mov x25, #68719476735 // =0xfffffffff
 ; CHECK-NEXT:    mov h0, v0.h[1]
-; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, xzr, x1, lt
+; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x19, x25, x9, gt
-; CHECK-NEXT:    csinv x20, x8, xzr, le
+; CHECK-NEXT:    csel x19, x25, x8, gt
+; CHECK-NEXT:    csinv x20, x9, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -1627,19 +1627,19 @@ define <4 x i100> @test_unsigned_v4f16_v4i100(<4 x half> %f) {
 ; CHECK-NEXT:    mov x4, x20
 ; CHECK-NEXT:    mov x5, x19
 ; CHECK-NEXT:    mov x6, x24
+; CHECK-NEXT:    mov x7, x23
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    mov x7, x23
-; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
 ; CHECK-NEXT:    csel x1, x25, x9, gt
-; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    ldp x30, x25, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
@@ -1675,14 +1675,14 @@ define <4 x i128> @test_unsigned_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    mov h0, v0.h[2]
 ; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    csel x8, xzr, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    mov h0, v0.h[2]
+; CHECK-NEXT:    csel x9, xzr, x1, lt
+; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csinv x19, x9, xzr, le
-; CHECK-NEXT:    csinv x20, x8, xzr, le
+; CHECK-NEXT:    csinv x19, x8, xzr, le
+; CHECK-NEXT:    csinv x20, x9, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -1712,19 +1712,19 @@ define <4 x i128> @test_unsigned_v4f16_v4i128(<4 x half> %f) {
 ; CHECK-NEXT:    mov x4, x21
 ; CHECK-NEXT:    mov x5, x22
 ; CHECK-NEXT:    mov x6, x23
+; CHECK-NEXT:    mov x7, x24
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    mov x7, x24
-; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csinv x1, x9, xzr, le
 ; CHECK-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], x1
+; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    csinv x1, x9, xzr, le
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
@@ -1754,42 +1754,42 @@ define <8 x i1> @test_unsigned_v8f16_v8i1(<8 x half> %f) {
 ; CHECK-CVT-NEXT:    mov s2, v1.s[1]
 ; CHECK-CVT-NEXT:    mov s3, v1.s[2]
 ; CHECK-CVT-NEXT:    mov s4, v1.s[3]
-; CHECK-CVT-NEXT:    mov s5, v0.s[1]
 ; CHECK-CVT-NEXT:    fcvtzu w9, s1
-; CHECK-CVT-NEXT:    fcvtzu w10, s0
+; CHECK-CVT-NEXT:    fcvtzu w13, s0
 ; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    fcvtzu w8, s2
-; CHECK-CVT-NEXT:    fcvtzu w11, s3
-; CHECK-CVT-NEXT:    fcvtzu w12, s4
-; CHECK-CVT-NEXT:    fcvtzu w13, s5
+; CHECK-CVT-NEXT:    mov s2, v0.s[1]
+; CHECK-CVT-NEXT:    fcvtzu w10, s3
+; CHECK-CVT-NEXT:    fcvtzu w11, s4
+; CHECK-CVT-NEXT:    fcvtzu w14, s1
+; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    cmp w8, #1
+; CHECK-CVT-NEXT:    fcvtzu w12, s2
 ; CHECK-CVT-NEXT:    csinc w8, w8, wzr, lo
 ; CHECK-CVT-NEXT:    cmp w9, #1
 ; CHECK-CVT-NEXT:    csinc w9, w9, wzr, lo
+; CHECK-CVT-NEXT:    cmp w10, #1
+; CHECK-CVT-NEXT:    csinc w10, w10, wzr, lo
 ; CHECK-CVT-NEXT:    cmp w11, #1
+; CHECK-CVT-NEXT:    fmov s1, w9
 ; CHECK-CVT-NEXT:    csinc w11, w11, wzr, lo
 ; CHECK-CVT-NEXT:    cmp w12, #1
 ; CHECK-CVT-NEXT:    csinc w12, w12, wzr, lo
 ; CHECK-CVT-NEXT:    cmp w13, #1
 ; CHECK-CVT-NEXT:    csinc w13, w13, wzr, lo
-; CHECK-CVT-NEXT:    cmp w10, #1
-; CHECK-CVT-NEXT:    csinc w10, w10, wzr, lo
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    fcvtzu w9, s1
-; CHECK-CVT-NEXT:    fmov s3, w10
-; CHECK-CVT-NEXT:    mov v2.s[1], w8
-; CHECK-CVT-NEXT:    cmp w9, #1
-; CHECK-CVT-NEXT:    csinc w8, w9, wzr, lo
-; CHECK-CVT-NEXT:    fcvtzu w9, s0
-; CHECK-CVT-NEXT:    mov v3.s[1], w13
-; CHECK-CVT-NEXT:    mov v2.s[2], w11
-; CHECK-CVT-NEXT:    cmp w9, #1
-; CHECK-CVT-NEXT:    mov v3.s[2], w8
-; CHECK-CVT-NEXT:    csinc w8, w9, wzr, lo
-; CHECK-CVT-NEXT:    mov v2.s[3], w12
-; CHECK-CVT-NEXT:    mov v3.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v3.8h, v2.8h
+; CHECK-CVT-NEXT:    mov v1.s[1], w8
+; CHECK-CVT-NEXT:    cmp w14, #1
+; CHECK-CVT-NEXT:    fmov s2, w13
+; CHECK-CVT-NEXT:    fcvtzu w8, s0
+; CHECK-CVT-NEXT:    csinc w9, w14, wzr, lo
+; CHECK-CVT-NEXT:    mov v2.s[1], w12
+; CHECK-CVT-NEXT:    mov v1.s[2], w10
+; CHECK-CVT-NEXT:    cmp w8, #1
+; CHECK-CVT-NEXT:    csinc w8, w8, wzr, lo
+; CHECK-CVT-NEXT:    mov v2.s[2], w9
+; CHECK-CVT-NEXT:    mov v1.s[3], w11
+; CHECK-CVT-NEXT:    mov v2.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -1813,42 +1813,42 @@ define <8 x i8> @test_unsigned_v8f16_v8i8(<8 x half> %f) {
 ; CHECK-CVT-NEXT:    mov s2, v1.s[1]
 ; CHECK-CVT-NEXT:    mov s3, v1.s[2]
 ; CHECK-CVT-NEXT:    mov s4, v1.s[3]
-; CHECK-CVT-NEXT:    mov s5, v0.s[1]
 ; CHECK-CVT-NEXT:    fcvtzu w10, s1
-; CHECK-CVT-NEXT:    fcvtzu w11, s0
+; CHECK-CVT-NEXT:    fcvtzu w14, s0
 ; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    fcvtzu w9, s2
-; CHECK-CVT-NEXT:    fcvtzu w12, s3
-; CHECK-CVT-NEXT:    fcvtzu w13, s4
-; CHECK-CVT-NEXT:    fcvtzu w14, s5
+; CHECK-CVT-NEXT:    mov s2, v0.s[1]
+; CHECK-CVT-NEXT:    fcvtzu w11, s3
+; CHECK-CVT-NEXT:    fcvtzu w12, s4
+; CHECK-CVT-NEXT:    fcvtzu w15, s1
+; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    cmp w9, #255
+; CHECK-CVT-NEXT:    fcvtzu w13, s2
 ; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
 ; CHECK-CVT-NEXT:    cmp w10, #255
 ; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
+; CHECK-CVT-NEXT:    cmp w11, #255
+; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
 ; CHECK-CVT-NEXT:    cmp w12, #255
+; CHECK-CVT-NEXT:    fmov s1, w10
 ; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
 ; CHECK-CVT-NEXT:    cmp w13, #255
 ; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
 ; CHECK-CVT-NEXT:    cmp w14, #255
 ; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
-; CHECK-CVT-NEXT:    cmp w11, #255
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    fmov s2, w10
-; CHECK-CVT-NEXT:    fcvtzu w10, s1
-; CHECK-CVT-NEXT:    fmov s3, w11
-; CHECK-CVT-NEXT:    mov v2.s[1], w9
-; CHECK-CVT-NEXT:    cmp w10, #255
-; CHECK-CVT-NEXT:    csel w9, w10, w8, lo
-; CHECK-CVT-NEXT:    fcvtzu w10, s0
-; CHECK-CVT-NEXT:    mov v3.s[1], w14
-; CHECK-CVT-NEXT:    mov v2.s[2], w12
-; CHECK-CVT-NEXT:    cmp w10, #255
-; CHECK-CVT-NEXT:    csel w8, w10, w8, lo
-; CHECK-CVT-NEXT:    mov v3.s[2], w9
-; CHECK-CVT-NEXT:    mov v2.s[3], w13
-; CHECK-CVT-NEXT:    mov v3.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v3.8h, v2.8h
+; CHECK-CVT-NEXT:    mov v1.s[1], w9
+; CHECK-CVT-NEXT:    cmp w15, #255
+; CHECK-CVT-NEXT:    fmov s2, w14
+; CHECK-CVT-NEXT:    fcvtzu w9, s0
+; CHECK-CVT-NEXT:    csel w10, w15, w8, lo
+; CHECK-CVT-NEXT:    mov v2.s[1], w13
+; CHECK-CVT-NEXT:    mov v1.s[2], w11
+; CHECK-CVT-NEXT:    cmp w9, #255
+; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
+; CHECK-CVT-NEXT:    mov v2.s[2], w10
+; CHECK-CVT-NEXT:    mov v1.s[3], w12
+; CHECK-CVT-NEXT:    mov v2.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-CVT-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -1870,42 +1870,42 @@ define <8 x i13> @test_unsigned_v8f16_v8i13(<8 x half> %f) {
 ; CHECK-CVT-NEXT:    mov s2, v1.s[1]
 ; CHECK-CVT-NEXT:    mov s3, v1.s[2]
 ; CHECK-CVT-NEXT:    mov s4, v1.s[3]
-; CHECK-CVT-NEXT:    mov s5, v0.s[1]
 ; CHECK-CVT-NEXT:    fcvtzu w10, s1
-; CHECK-CVT-NEXT:    fcvtzu w11, s0
+; CHECK-CVT-NEXT:    fcvtzu w14, s0
 ; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    fcvtzu w9, s2
-; CHECK-CVT-NEXT:    fcvtzu w12, s3
-; CHECK-CVT-NEXT:    fcvtzu w13, s4
-; CHECK-CVT-NEXT:    fcvtzu w14, s5
+; CHECK-CVT-NEXT:    mov s2, v0.s[1]
+; CHECK-CVT-NEXT:    fcvtzu w11, s3
+; CHECK-CVT-NEXT:    fcvtzu w12, s4
+; CHECK-CVT-NEXT:    fcvtzu w15, s1
+; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    cmp w9, w8
+; CHECK-CVT-NEXT:    fcvtzu w13, s2
 ; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
 ; CHECK-CVT-NEXT:    cmp w10, w8
 ; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
+; CHECK-CVT-NEXT:    cmp w11, w8
+; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
 ; CHECK-CVT-NEXT:    cmp w12, w8
+; CHECK-CVT-NEXT:    fmov s1, w10
 ; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
 ; CHECK-CVT-NEXT:    cmp w13, w8
 ; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
 ; CHECK-CVT-NEXT:    cmp w14, w8
 ; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    fmov s2, w10
-; CHECK-CVT-NEXT:    fcvtzu w10, s1
-; CHECK-CVT-NEXT:    fmov s3, w11
-; CHECK-CVT-NEXT:    mov v2.s[1], w9
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    csel w9, w10, w8, lo
-; CHECK-CVT-NEXT:    fcvtzu w10, s0
-; CHECK-CVT-NEXT:    mov v3.s[1], w14
-; CHECK-CVT-NEXT:    mov v2.s[2], w12
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    csel w8, w10, w8, lo
-; CHECK-CVT-NEXT:    mov v3.s[2], w9
-; CHECK-CVT-NEXT:    mov v2.s[3], w13
-; CHECK-CVT-NEXT:    mov v3.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v3.8h, v2.8h
+; CHECK-CVT-NEXT:    mov v1.s[1], w9
+; CHECK-CVT-NEXT:    cmp w15, w8
+; CHECK-CVT-NEXT:    fmov s2, w14
+; CHECK-CVT-NEXT:    fcvtzu w9, s0
+; CHECK-CVT-NEXT:    csel w10, w15, w8, lo
+; CHECK-CVT-NEXT:    mov v2.s[1], w13
+; CHECK-CVT-NEXT:    mov v1.s[2], w11
+; CHECK-CVT-NEXT:    cmp w9, w8
+; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
+; CHECK-CVT-NEXT:    mov v2.s[2], w10
+; CHECK-CVT-NEXT:    mov v1.s[3], w12
+; CHECK-CVT-NEXT:    mov v2.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i13:
@@ -1927,42 +1927,42 @@ define <8 x i16> @test_unsigned_v8f16_v8i16(<8 x half> %f) {
 ; CHECK-CVT-NEXT:    mov s2, v1.s[1]
 ; CHECK-CVT-NEXT:    mov s3, v1.s[2]
 ; CHECK-CVT-NEXT:    mov s4, v1.s[3]
-; CHECK-CVT-NEXT:    mov s5, v0.s[1]
 ; CHECK-CVT-NEXT:    fcvtzu w10, s1
-; CHECK-CVT-NEXT:    fcvtzu w11, s0
+; CHECK-CVT-NEXT:    fcvtzu w14, s0
 ; CHECK-CVT-NEXT:    mov s1, v0.s[2]
-; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    fcvtzu w9, s2
-; CHECK-CVT-NEXT:    fcvtzu w12, s3
-; CHECK-CVT-NEXT:    fcvtzu w13, s4
-; CHECK-CVT-NEXT:    fcvtzu w14, s5
+; CHECK-CVT-NEXT:    mov s2, v0.s[1]
+; CHECK-CVT-NEXT:    fcvtzu w11, s3
+; CHECK-CVT-NEXT:    fcvtzu w12, s4
+; CHECK-CVT-NEXT:    fcvtzu w15, s1
+; CHECK-CVT-NEXT:    mov s0, v0.s[3]
 ; CHECK-CVT-NEXT:    cmp w9, w8
+; CHECK-CVT-NEXT:    fcvtzu w13, s2
 ; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
 ; CHECK-CVT-NEXT:    cmp w10, w8
 ; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
+; CHECK-CVT-NEXT:    cmp w11, w8
+; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
 ; CHECK-CVT-NEXT:    cmp w12, w8
+; CHECK-CVT-NEXT:    fmov s1, w10
 ; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
 ; CHECK-CVT-NEXT:    cmp w13, w8
 ; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
 ; CHECK-CVT-NEXT:    cmp w14, w8
 ; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    fmov s2, w10
-; CHECK-CVT-NEXT:    fcvtzu w10, s1
-; CHECK-CVT-NEXT:    fmov s3, w11
-; CHECK-CVT-NEXT:    mov v2.s[1], w9
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    csel w9, w10, w8, lo
-; CHECK-CVT-NEXT:    fcvtzu w10, s0
-; CHECK-CVT-NEXT:    mov v3.s[1], w14
-; CHECK-CVT-NEXT:    mov v2.s[2], w12
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    csel w8, w10, w8, lo
-; CHECK-CVT-NEXT:    mov v3.s[2], w9
-; CHECK-CVT-NEXT:    mov v2.s[3], w13
-; CHECK-CVT-NEXT:    mov v3.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v3.8h, v2.8h
+; CHECK-CVT-NEXT:    mov v1.s[1], w9
+; CHECK-CVT-NEXT:    cmp w15, w8
+; CHECK-CVT-NEXT:    fmov s2, w14
+; CHECK-CVT-NEXT:    fcvtzu w9, s0
+; CHECK-CVT-NEXT:    csel w10, w15, w8, lo
+; CHECK-CVT-NEXT:    mov v2.s[1], w13
+; CHECK-CVT-NEXT:    mov v1.s[2], w11
+; CHECK-CVT-NEXT:    cmp w9, w8
+; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
+; CHECK-CVT-NEXT:    mov v2.s[2], w10
+; CHECK-CVT-NEXT:    mov v1.s[3], w12
+; CHECK-CVT-NEXT:    mov v2.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i16:
@@ -1985,8 +1985,8 @@ define <8 x i19> @test_unsigned_v8f16_v8i19(<8 x half> %f) {
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    mov w1, v2.s[1]
 ; CHECK-NEXT:    mov w2, v2.s[2]
-; CHECK-NEXT:    mov w5, v0.s[1]
 ; CHECK-NEXT:    mov w3, v2.s[3]
+; CHECK-NEXT:    mov w5, v0.s[1]
 ; CHECK-NEXT:    mov w6, v0.s[2]
 ; CHECK-NEXT:    mov w7, v0.s[3]
 ; CHECK-NEXT:    fmov w4, s0
@@ -2012,81 +2012,81 @@ define <8 x i50> @test_unsigned_v8f16_v8i50(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i50:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-CVT-NEXT:    mov h5, v0.h[1]
 ; CHECK-CVT-NEXT:    mov x8, #1125899906842623 // =0x3ffffffffffff
-; CHECK-CVT-NEXT:    mov h2, v0.h[1]
-; CHECK-CVT-NEXT:    mov h3, v0.h[2]
-; CHECK-CVT-NEXT:    mov h5, v0.h[3]
+; CHECK-CVT-NEXT:    mov h6, v0.h[2]
+; CHECK-CVT-NEXT:    mov h7, v0.h[3]
 ; CHECK-CVT-NEXT:    fcvt s0, h0
-; CHECK-CVT-NEXT:    mov h4, v1.h[1]
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    mov h7, v1.h[3]
+; CHECK-CVT-NEXT:    mov h2, v1.h[1]
+; CHECK-CVT-NEXT:    mov h3, v1.h[2]
+; CHECK-CVT-NEXT:    mov h4, v1.h[3]
 ; CHECK-CVT-NEXT:    fcvt s1, h1
+; CHECK-CVT-NEXT:    fcvtzu x13, s0
 ; CHECK-CVT-NEXT:    fcvt s2, h2
 ; CHECK-CVT-NEXT:    fcvt s3, h3
-; CHECK-CVT-NEXT:    fcvtzu x9, s0
-; CHECK-CVT-NEXT:    fcvt s5, h5
 ; CHECK-CVT-NEXT:    fcvt s4, h4
-; CHECK-CVT-NEXT:    fcvt s6, h6
-; CHECK-CVT-NEXT:    fcvt s0, h7
-; CHECK-CVT-NEXT:    fcvtzu x10, s1
-; CHECK-CVT-NEXT:    fcvtzu x11, s2
-; CHECK-CVT-NEXT:    fcvtzu x12, s3
-; CHECK-CVT-NEXT:    fcvtzu x14, s5
-; CHECK-CVT-NEXT:    fcvtzu x13, s4
-; CHECK-CVT-NEXT:    fcvtzu x15, s6
-; CHECK-CVT-NEXT:    cmp x10, x8
-; CHECK-CVT-NEXT:    fcvtzu x16, s0
-; CHECK-CVT-NEXT:    csel x4, x10, x8, lo
-; CHECK-CVT-NEXT:    cmp x13, x8
-; CHECK-CVT-NEXT:    csel x5, x13, x8, lo
-; CHECK-CVT-NEXT:    cmp x15, x8
-; CHECK-CVT-NEXT:    csel x6, x15, x8, lo
-; CHECK-CVT-NEXT:    cmp x16, x8
-; CHECK-CVT-NEXT:    csel x7, x16, x8, lo
+; CHECK-CVT-NEXT:    fcvtzu x9, s1
+; CHECK-CVT-NEXT:    fcvt s1, h5
+; CHECK-CVT-NEXT:    fcvtzu x10, s2
+; CHECK-CVT-NEXT:    fcvtzu x11, s3
+; CHECK-CVT-NEXT:    fcvt s2, h6
+; CHECK-CVT-NEXT:    fcvtzu x12, s4
+; CHECK-CVT-NEXT:    fcvt s3, h7
 ; CHECK-CVT-NEXT:    cmp x9, x8
-; CHECK-CVT-NEXT:    csel x0, x9, x8, lo
+; CHECK-CVT-NEXT:    fcvtzu x14, s1
+; CHECK-CVT-NEXT:    csel x4, x9, x8, lo
+; CHECK-CVT-NEXT:    cmp x10, x8
+; CHECK-CVT-NEXT:    fcvtzu x9, s2
+; CHECK-CVT-NEXT:    csel x5, x10, x8, lo
 ; CHECK-CVT-NEXT:    cmp x11, x8
-; CHECK-CVT-NEXT:    csel x1, x11, x8, lo
+; CHECK-CVT-NEXT:    fcvtzu x10, s3
+; CHECK-CVT-NEXT:    csel x6, x11, x8, lo
 ; CHECK-CVT-NEXT:    cmp x12, x8
-; CHECK-CVT-NEXT:    csel x2, x12, x8, lo
+; CHECK-CVT-NEXT:    csel x7, x12, x8, lo
+; CHECK-CVT-NEXT:    cmp x13, x8
+; CHECK-CVT-NEXT:    csel x0, x13, x8, lo
 ; CHECK-CVT-NEXT:    cmp x14, x8
-; CHECK-CVT-NEXT:    csel x3, x14, x8, lo
+; CHECK-CVT-NEXT:    csel x1, x14, x8, lo
+; CHECK-CVT-NEXT:    cmp x9, x8
+; CHECK-CVT-NEXT:    csel x2, x9, x8, lo
+; CHECK-CVT-NEXT:    cmp x10, x8
+; CHECK-CVT-NEXT:    csel x3, x10, x8, lo
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i50:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-FP16-NEXT:    mov x8, #1125899906842623 // =0x3ffffffffffff
-; CHECK-FP16-NEXT:    mov h2, v0.h[1]
-; CHECK-FP16-NEXT:    mov h3, v0.h[2]
-; CHECK-FP16-NEXT:    mov h5, v0.h[3]
-; CHECK-FP16-NEXT:    fcvtzu x9, h0
-; CHECK-FP16-NEXT:    mov h4, v1.h[1]
-; CHECK-FP16-NEXT:    mov h6, v1.h[2]
-; CHECK-FP16-NEXT:    mov h0, v1.h[3]
-; CHECK-FP16-NEXT:    fcvtzu x10, h1
-; CHECK-FP16-NEXT:    fcvtzu x11, h2
-; CHECK-FP16-NEXT:    fcvtzu x12, h3
-; CHECK-FP16-NEXT:    fcvtzu x14, h5
-; CHECK-FP16-NEXT:    fcvtzu x13, h4
-; CHECK-FP16-NEXT:    fcvtzu x15, h6
-; CHECK-FP16-NEXT:    cmp x10, x8
-; CHECK-FP16-NEXT:    fcvtzu x16, h0
-; CHECK-FP16-NEXT:    csel x4, x10, x8, lo
-; CHECK-FP16-NEXT:    cmp x13, x8
-; CHECK-FP16-NEXT:    csel x5, x13, x8, lo
-; CHECK-FP16-NEXT:    cmp x15, x8
-; CHECK-FP16-NEXT:    csel x6, x15, x8, lo
-; CHECK-FP16-NEXT:    cmp x16, x8
-; CHECK-FP16-NEXT:    csel x7, x16, x8, lo
+; CHECK-FP16-NEXT:    fcvtzu x13, h0
+; CHECK-FP16-NEXT:    mov h2, v1.h[1]
+; CHECK-FP16-NEXT:    mov h3, v1.h[2]
+; CHECK-FP16-NEXT:    mov h4, v1.h[3]
+; CHECK-FP16-NEXT:    fcvtzu x9, h1
+; CHECK-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-FP16-NEXT:    fcvtzu x10, h2
+; CHECK-FP16-NEXT:    fcvtzu x11, h3
+; CHECK-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-FP16-NEXT:    fcvtzu x12, h4
+; CHECK-FP16-NEXT:    mov h3, v0.h[3]
 ; CHECK-FP16-NEXT:    cmp x9, x8
-; CHECK-FP16-NEXT:    csel x0, x9, x8, lo
+; CHECK-FP16-NEXT:    fcvtzu x14, h1
+; CHECK-FP16-NEXT:    csel x4, x9, x8, lo
+; CHECK-FP16-NEXT:    cmp x10, x8
+; CHECK-FP16-NEXT:    fcvtzu x9, h2
+; CHECK-FP16-NEXT:    csel x5, x10, x8, lo
 ; CHECK-FP16-NEXT:    cmp x11, x8
-; CHECK-FP16-NEXT:    csel x1, x11, x8, lo
+; CHECK-FP16-NEXT:    fcvtzu x10, h3
+; CHECK-FP16-NEXT:    csel x6, x11, x8, lo
 ; CHECK-FP16-NEXT:    cmp x12, x8
-; CHECK-FP16-NEXT:    csel x2, x12, x8, lo
+; CHECK-FP16-NEXT:    csel x7, x12, x8, lo
+; CHECK-FP16-NEXT:    cmp x13, x8
+; CHECK-FP16-NEXT:    csel x0, x13, x8, lo
 ; CHECK-FP16-NEXT:    cmp x14, x8
-; CHECK-FP16-NEXT:    csel x3, x14, x8, lo
+; CHECK-FP16-NEXT:    csel x1, x14, x8, lo
+; CHECK-FP16-NEXT:    cmp x9, x8
+; CHECK-FP16-NEXT:    csel x2, x9, x8, lo
+; CHECK-FP16-NEXT:    cmp x10, x8
+; CHECK-FP16-NEXT:    csel x3, x10, x8, lo
 ; CHECK-FP16-NEXT:    ret
     %x = call <8 x i50> @llvm.fptoui.sat.v8f16.v8i50(<8 x half> %f)
     ret <8 x i50> %x
@@ -2096,63 +2096,63 @@ define <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) {
 ; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i64:
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-CVT-NEXT:    mov h2, v0.h[2]
-; CHECK-CVT-NEXT:    fcvt s3, h0
-; CHECK-CVT-NEXT:    mov h7, v0.h[1]
-; CHECK-CVT-NEXT:    mov h0, v0.h[3]
-; CHECK-CVT-NEXT:    mov h4, v1.h[1]
-; CHECK-CVT-NEXT:    mov h6, v1.h[2]
-; CHECK-CVT-NEXT:    fcvt s5, h1
-; CHECK-CVT-NEXT:    mov h1, v1.h[3]
+; CHECK-CVT-NEXT:    mov h4, v0.h[2]
+; CHECK-CVT-NEXT:    mov h3, v0.h[1]
+; CHECK-CVT-NEXT:    mov h7, v0.h[3]
+; CHECK-CVT-NEXT:    fcvt s0, h0
+; CHECK-CVT-NEXT:    mov h2, v1.h[2]
+; CHECK-CVT-NEXT:    mov h5, v1.h[1]
+; CHECK-CVT-NEXT:    mov h6, v1.h[3]
+; CHECK-CVT-NEXT:    fcvt s1, h1
+; CHECK-CVT-NEXT:    fcvt s4, h4
+; CHECK-CVT-NEXT:    fcvt s3, h3
+; CHECK-CVT-NEXT:    fcvt s7, h7
+; CHECK-CVT-NEXT:    fcvtzu x9, s0
 ; CHECK-CVT-NEXT:    fcvt s2, h2
-; CHECK-CVT-NEXT:    fcvtzu x8, s3
-; CHECK-CVT-NEXT:    fcvt s3, h4
-; CHECK-CVT-NEXT:    fcvt s4, h6
-; CHECK-CVT-NEXT:    fcvtzu x9, s5
-; CHECK-CVT-NEXT:    fcvt s5, h7
-; CHECK-CVT-NEXT:    fcvt s6, h0
-; CHECK-CVT-NEXT:    fcvt s7, h1
-; CHECK-CVT-NEXT:    fcvtzu x10, s2
-; CHECK-CVT-NEXT:    fmov d0, x8
-; CHECK-CVT-NEXT:    fmov d2, x9
-; CHECK-CVT-NEXT:    fcvtzu x9, s4
+; CHECK-CVT-NEXT:    fcvt s5, h5
+; CHECK-CVT-NEXT:    fcvt s6, h6
+; CHECK-CVT-NEXT:    fcvtzu x8, s1
+; CHECK-CVT-NEXT:    fcvtzu x12, s4
 ; CHECK-CVT-NEXT:    fcvtzu x11, s3
-; CHECK-CVT-NEXT:    fcvtzu x8, s5
-; CHECK-CVT-NEXT:    fmov d1, x10
-; CHECK-CVT-NEXT:    fcvtzu x10, s6
-; CHECK-CVT-NEXT:    fmov d3, x9
-; CHECK-CVT-NEXT:    fcvtzu x9, s7
-; CHECK-CVT-NEXT:    mov v2.d[1], x11
-; CHECK-CVT-NEXT:    mov v0.d[1], x8
-; CHECK-CVT-NEXT:    mov v1.d[1], x10
-; CHECK-CVT-NEXT:    mov v3.d[1], x9
+; CHECK-CVT-NEXT:    fcvtzu x15, s7
+; CHECK-CVT-NEXT:    fmov d0, x9
+; CHECK-CVT-NEXT:    fcvtzu x10, s2
+; CHECK-CVT-NEXT:    fcvtzu x13, s5
+; CHECK-CVT-NEXT:    fcvtzu x14, s6
+; CHECK-CVT-NEXT:    fmov d2, x8
+; CHECK-CVT-NEXT:    fmov d1, x12
+; CHECK-CVT-NEXT:    mov v0.d[1], x11
+; CHECK-CVT-NEXT:    fmov d3, x10
+; CHECK-CVT-NEXT:    mov v2.d[1], x13
+; CHECK-CVT-NEXT:    mov v1.d[1], x15
+; CHECK-CVT-NEXT:    mov v3.d[1], x14
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i64:
 ; CHECK-FP16:       // %bb.0:
 ; CHECK-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-FP16-NEXT:    mov h2, v0.h[2]
-; CHECK-FP16-NEXT:    mov h5, v0.h[1]
-; CHECK-FP16-NEXT:    mov h6, v0.h[3]
-; CHECK-FP16-NEXT:    fcvtzu x8, h0
-; CHECK-FP16-NEXT:    mov h4, v1.h[2]
-; CHECK-FP16-NEXT:    fcvtzu x9, h1
-; CHECK-FP16-NEXT:    mov h3, v1.h[1]
-; CHECK-FP16-NEXT:    mov h7, v1.h[3]
-; CHECK-FP16-NEXT:    fcvtzu x10, h2
-; CHECK-FP16-NEXT:    fmov d0, x8
-; CHECK-FP16-NEXT:    fmov d2, x9
-; CHECK-FP16-NEXT:    fcvtzu x8, h5
-; CHECK-FP16-NEXT:    fcvtzu x9, h4
+; CHECK-FP16-NEXT:    mov h4, v0.h[2]
+; CHECK-FP16-NEXT:    mov h3, v0.h[1]
+; CHECK-FP16-NEXT:    mov h7, v0.h[3]
+; CHECK-FP16-NEXT:    fcvtzu x9, h0
+; CHECK-FP16-NEXT:    mov h2, v1.h[2]
+; CHECK-FP16-NEXT:    mov h5, v1.h[1]
+; CHECK-FP16-NEXT:    mov h6, v1.h[3]
+; CHECK-FP16-NEXT:    fcvtzu x8, h1
+; CHECK-FP16-NEXT:    fcvtzu x12, h4
 ; CHECK-FP16-NEXT:    fcvtzu x11, h3
-; CHECK-FP16-NEXT:    fmov d1, x10
-; CHECK-FP16-NEXT:    fcvtzu x10, h6
-; CHECK-FP16-NEXT:    fmov d3, x9
-; CHECK-FP16-NEXT:    fcvtzu x9, h7
-; CHECK-FP16-NEXT:    mov v2.d[1], x11
-; CHECK-FP16-NEXT:    mov v0.d[1], x8
-; CHECK-FP16-NEXT:    mov v1.d[1], x10
-; CHECK-FP16-NEXT:    mov v3.d[1], x9
+; CHECK-FP16-NEXT:    fcvtzu x15, h7
+; CHECK-FP16-NEXT:    fmov d0, x9
+; CHECK-FP16-NEXT:    fcvtzu x10, h2
+; CHECK-FP16-NEXT:    fcvtzu x13, h5
+; CHECK-FP16-NEXT:    fcvtzu x14, h6
+; CHECK-FP16-NEXT:    fmov d2, x8
+; CHECK-FP16-NEXT:    fmov d1, x12
+; CHECK-FP16-NEXT:    mov v0.d[1], x11
+; CHECK-FP16-NEXT:    fmov d3, x10
+; CHECK-FP16-NEXT:    mov v2.d[1], x13
+; CHECK-FP16-NEXT:    mov v1.d[1], x15
+; CHECK-FP16-NEXT:    mov v3.d[1], x14
 ; CHECK-FP16-NEXT:    ret
     %x = call <8 x i64> @llvm.fptoui.sat.v8f16.v8i64(<8 x half> %f)
     ret <8 x i64> %x
@@ -2185,8 +2185,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -104
 ; CHECK-NEXT:    .cfi_offset b9, -112
 ; CHECK-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x19, x8
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    mov x19, x8
 ; CHECK-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov h0, v0.h[1]
 ; CHECK-NEXT:    fcvt s8, h0
@@ -2195,17 +2195,17 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #1904214015 // =0x717fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    mov x23, #68719476735 // =0xfffffffff
-; CHECK-NEXT:    mov h0, v0.h[3]
 ; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x9, xzr, x1, lt
+; CHECK-NEXT:    mov x22, #68719476735 // =0xfffffffff
+; CHECK-NEXT:    mov h0, v0.h[3]
+; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    csel x8, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x9, x23, x9, gt
-; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    csel x10, x22, x8, gt
+; CHECK-NEXT:    csinv x8, x9, xzr, le
+; CHECK-NEXT:    stp x8, x10, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    stp x8, x9, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    fcmp s8, #0.0
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -2213,10 +2213,10 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x9, x23, x9, gt
+; CHECK-NEXT:    csel x9, x22, x9, gt
 ; CHECK-NEXT:    csinv x24, x8, xzr, le
-; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, #0.0
@@ -2226,7 +2226,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csel x25, x23, x9, gt
+; CHECK-NEXT:    csel x25, x22, x9, gt
 ; CHECK-NEXT:    str x8, [sp, #32] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
@@ -2238,7 +2238,7 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    csel x27, x23, x9, gt
+; CHECK-NEXT:    csel x26, x22, x9, gt
 ; CHECK-NEXT:    str x8, [sp] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
@@ -2249,8 +2249,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x29, x23, x9, gt
-; CHECK-NEXT:    csinv x26, x8, xzr, le
+; CHECK-NEXT:    csel x29, x22, x9, gt
+; CHECK-NEXT:    csinv x27, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    fcmp s8, #0.0
@@ -2259,8 +2259,8 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x28, x23, x9, gt
-; CHECK-NEXT:    csinv x20, x8, xzr, le
+; CHECK-NEXT:    csel x20, x22, x9, gt
+; CHECK-NEXT:    csinv x21, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -2270,45 +2270,46 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
 ; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csel x21, x23, x9, gt
-; CHECK-NEXT:    csinv x22, x8, xzr, le
+; CHECK-NEXT:    csel x28, x22, x9, gt
+; CHECK-NEXT:    csinv x23, x8, xzr, le
 ; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
+; CHECK-NEXT:    ldr x9, [sp] // 8-byte Folded Reload
+; CHECK-NEXT:    extr x8, x20, x21, #28
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    extr x8, x28, x20, #28
-; CHECK-NEXT:    bfi x21, x26, #36, #28
-; CHECK-NEXT:    extr x9, x29, x26, #28
+; CHECK-NEXT:    bfi x28, x27, #36, #28
 ; CHECK-NEXT:    lsr x11, x29, #28
-; CHECK-NEXT:    str x22, [x19]
+; CHECK-NEXT:    bfi x26, x24, #36, #28
+; CHECK-NEXT:    stur x9, [x19, #75]
+; CHECK-NEXT:    extr x9, x29, x27, #28
 ; CHECK-NEXT:    stur x8, [x19, #41]
 ; CHECK-NEXT:    csel x8, xzr, x0, lt
-; CHECK-NEXT:    csel x10, xzr, x1, lt
+; CHECK-NEXT:    str x9, [x19, #16]
+; CHECK-NEXT:    csel x9, xzr, x1, lt
 ; CHECK-NEXT:    fcmp s8, s9
-; CHECK-NEXT:    stp x21, x9, [x19, #8]
-; CHECK-NEXT:    lsr x9, x28, #28
+; CHECK-NEXT:    ldr x10, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    stp x23, x28, [x19]
 ; CHECK-NEXT:    strb w11, [x19, #24]
-; CHECK-NEXT:    bfi x27, x24, #36, #28
-; CHECK-NEXT:    csel x10, x23, x10, gt
+; CHECK-NEXT:    stur x10, [x19, #50]
+; CHECK-NEXT:    lsr x10, x20, #28
+; CHECK-NEXT:    csel x9, x22, x9, gt
+; CHECK-NEXT:    bfi x9, x21, #36, #28
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    bfi x10, x20, #36, #28
-; CHECK-NEXT:    strb w9, [x19, #49]
+; CHECK-NEXT:    strb w10, [x19, #49]
+; CHECK-NEXT:    ldr x11, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    stur x8, [x19, #25]
-; CHECK-NEXT:    stur x10, [x19, #33]
-; CHECK-NEXT:    ldp x9, x12, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    stur x9, [x19, #75]
-; CHECK-NEXT:    extr x8, x12, x24, #28
-; CHECK-NEXT:    ldr x9, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    stur x9, [x19, #50]
-; CHECK-NEXT:    ldp x11, x10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    stur x8, [x19, #91]
-; CHECK-NEXT:    lsr x8, x12, #28
-; CHECK-NEXT:    stur x27, [x19, #83]
-; CHECK-NEXT:    extr x9, x10, x11, #28
-; CHECK-NEXT:    bfi x25, x11, #36, #28
-; CHECK-NEXT:    strb w8, [x19, #99]
-; CHECK-NEXT:    stur x9, [x19, #66]
-; CHECK-NEXT:    lsr x9, x10, #28
+; CHECK-NEXT:    stur x9, [x19, #33]
+; CHECK-NEXT:    extr x10, x11, x24, #28
+; CHECK-NEXT:    stur x10, [x19, #91]
+; CHECK-NEXT:    ldp x10, x9, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    stur x26, [x19, #83]
+; CHECK-NEXT:    extr x8, x9, x10, #28
+; CHECK-NEXT:    bfi x25, x10, #36, #28
+; CHECK-NEXT:    lsr x9, x9, #28
+; CHECK-NEXT:    stur x8, [x19, #66]
+; CHECK-NEXT:    lsr x8, x11, #28
 ; CHECK-NEXT:    stur x25, [x19, #58]
+; CHECK-NEXT:    strb w8, [x19, #99]
 ; CHECK-NEXT:    strb w9, [x19, #74]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
@@ -2350,8 +2351,8 @@ define <8 x i128> @test_unsigned_v8f16_v8i128(<8 x half> %f) {
 ; CHECK-NEXT:    .cfi_offset b8, -104
 ; CHECK-NEXT:    .cfi_offset b9, -112
 ; CHECK-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x19, x8
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    mov x19, x8
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
@@ -2359,16 +2360,16 @@ define <8 x i128> @test_unsigned_v8f16_v8i128(<8 x half> %f) {
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT:    fcmp s8, #0.0
-; CHECK-NEXT:    mov h0, v0.h[1]
 ; CHECK-NEXT:    fmov s9, w8
-; CHECK-NEXT:    csel x8, xzr, x1, lt
-; CHECK-NEXT:    csel x9, xzr, x0, lt
+; CHECK-NEXT:    mov h0, v0.h[1]
+; CHECK-NEXT:    csel x9, xzr, x1, lt
+; CHECK-NEXT:    csel x8, xzr, x0, lt
 ; CHECK-NEXT:    fcmp s8, s9
 ; CHECK-NEXT:    fcvt s8, h0
-; CHECK-NEXT:    csinv x9, x9, xzr, le
-; CHECK-NEXT:    csinv x8, x8, xzr, le
+; CHECK-NEXT:    csinv x10, x8, xzr, le
+; CHECK-NEXT:    csinv x8, x9, xzr, le
+; CHECK-NEXT:    stp x8, x10, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov s0, s8
-; CHECK-NEXT:    stp x8, x9, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, #0.0
@@ -2379,8 +2380,8 @@ define <8 x i128> @test_unsigned_v8f16_v8i128(<8 x half> %f) {
 ; CHECK-NEXT:    fcvt s8, h0
 ; CHECK-NEXT:    csinv x9, x9, xzr, le
 ; CHECK-NEXT:    csinv x8, x8, xzr, le
-; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    stp x8, x9, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    fmov s0, s8
 ; CHECK-NEXT:    bl __fixunssfti
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    fcmp s8, #0.0
@@ -2558,87 +2559,87 @@ define <16 x i8> @test_unsigned_v16f16_v16i8(<16 x half> %f) {
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
 ; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT:    fcvtl2 v5.4s, v0.8h
 ; CHECK-CVT-NEXT:    mov w8, #255 // =0xff
-; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-CVT-NEXT:    mov s3, v2.s[1]
 ; CHECK-CVT-NEXT:    mov s4, v2.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w9, s2
-; CHECK-CVT-NEXT:    mov s2, v2.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w12, s1
-; CHECK-CVT-NEXT:    fcvtzu w16, s5
-; CHECK-CVT-NEXT:    fcvtzu w2, s0
-; CHECK-CVT-NEXT:    fcvtzu w10, s3
+; CHECK-CVT-NEXT:    mov s5, v2.s[3]
+; CHECK-CVT-NEXT:    fcvtzu w10, s2
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-NEXT:    fcvtzu w13, s1
+; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-CVT-NEXT:    fcvtzu w9, s3
 ; CHECK-CVT-NEXT:    mov s3, v1.s[1]
 ; CHECK-CVT-NEXT:    fcvtzu w11, s4
 ; CHECK-CVT-NEXT:    mov s4, v1.s[2]
+; CHECK-CVT-NEXT:    fcvtzu w12, s5
 ; CHECK-CVT-NEXT:    mov s1, v1.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w13, s2
-; CHECK-CVT-NEXT:    cmp w10, #255
-; CHECK-CVT-NEXT:    mov s2, v5.s[1]
+; CHECK-CVT-NEXT:    fcvtzu w18, s2
+; CHECK-CVT-NEXT:    fcvtzu w3, s0
 ; CHECK-CVT-NEXT:    fcvtzu w14, s3
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
 ; CHECK-CVT-NEXT:    cmp w9, #255
-; CHECK-CVT-NEXT:    fcvtzu w15, s4
+; CHECK-CVT-NEXT:    mov s3, v2.s[1]
 ; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
+; CHECK-CVT-NEXT:    cmp w10, #255
+; CHECK-CVT-NEXT:    fcvtzu w15, s4
+; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
 ; CHECK-CVT-NEXT:    cmp w11, #255
+; CHECK-CVT-NEXT:    mov s4, v2.s[2]
 ; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    cmp w13, #255
-; CHECK-CVT-NEXT:    mov s3, v5.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w17, s1
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
-; CHECK-CVT-NEXT:    cmp w14, #255
-; CHECK-CVT-NEXT:    mov s4, v5.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w18, s2
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
 ; CHECK-CVT-NEXT:    cmp w12, #255
-; CHECK-CVT-NEXT:    mov s1, v0.s[1]
+; CHECK-CVT-NEXT:    fcvtzu w16, s1
+; CHECK-CVT-NEXT:    mov s1, v2.s[3]
 ; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
+; CHECK-CVT-NEXT:    cmp w14, #255
+; CHECK-CVT-NEXT:    fcvtzu w17, s3
+; CHECK-CVT-NEXT:    mov s3, v0.s[1]
+; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
+; CHECK-CVT-NEXT:    cmp w13, #255
+; CHECK-CVT-NEXT:    fcvtzu w0, s4
+; CHECK-CVT-NEXT:    fmov s2, w10
+; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
 ; CHECK-CVT-NEXT:    cmp w15, #255
-; CHECK-CVT-NEXT:    fcvtzu w0, s3
 ; CHECK-CVT-NEXT:    csel w15, w15, w8, lo
-; CHECK-CVT-NEXT:    cmp w17, #255
-; CHECK-CVT-NEXT:    csel w17, w17, w8, lo
-; CHECK-CVT-NEXT:    cmp w18, #255
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    csel w9, w18, w8, lo
-; CHECK-CVT-NEXT:    fcvtzu w18, s4
 ; CHECK-CVT-NEXT:    cmp w16, #255
 ; CHECK-CVT-NEXT:    fcvtzu w1, s1
 ; CHECK-CVT-NEXT:    csel w16, w16, w8, lo
-; CHECK-CVT-NEXT:    cmp w0, #255
+; CHECK-CVT-NEXT:    cmp w17, #255
+; CHECK-CVT-NEXT:    fcvtzu w2, s3
+; CHECK-CVT-NEXT:    csel w17, w17, w8, lo
+; CHECK-CVT-NEXT:    cmp w18, #255
 ; CHECK-CVT-NEXT:    mov s1, v0.s[2]
+; CHECK-CVT-NEXT:    csel w18, w18, w8, lo
+; CHECK-CVT-NEXT:    cmp w0, #255
+; CHECK-CVT-NEXT:    mov v2.s[1], w9
 ; CHECK-CVT-NEXT:    csel w0, w0, w8, lo
-; CHECK-CVT-NEXT:    cmp w18, #255
-; CHECK-CVT-NEXT:    mov v2.s[1], w10
-; CHECK-CVT-NEXT:    csel w10, w18, w8, lo
 ; CHECK-CVT-NEXT:    cmp w1, #255
-; CHECK-CVT-NEXT:    fmov s3, w12
-; CHECK-CVT-NEXT:    csel w18, w1, w8, lo
+; CHECK-CVT-NEXT:    fmov s3, w18
+; CHECK-CVT-NEXT:    csel w10, w1, w8, lo
 ; CHECK-CVT-NEXT:    cmp w2, #255
-; CHECK-CVT-NEXT:    csel w1, w2, w8, lo
-; CHECK-CVT-NEXT:    fmov s4, w16
-; CHECK-CVT-NEXT:    mov v2.s[2], w11
-; CHECK-CVT-NEXT:    fcvtzu w11, s1
 ; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    fmov s1, w1
-; CHECK-CVT-NEXT:    mov v3.s[1], w14
-; CHECK-CVT-NEXT:    cmp w11, #255
+; CHECK-CVT-NEXT:    csel w9, w2, w8, lo
+; CHECK-CVT-NEXT:    cmp w3, #255
+; CHECK-CVT-NEXT:    fcvtzu w2, s1
+; CHECK-CVT-NEXT:    csel w1, w3, w8, lo
+; CHECK-CVT-NEXT:    fmov s1, w13
+; CHECK-CVT-NEXT:    mov v3.s[1], w17
+; CHECK-CVT-NEXT:    fmov s4, w1
+; CHECK-CVT-NEXT:    mov v2.s[2], w11
+; CHECK-CVT-NEXT:    mov v1.s[1], w14
+; CHECK-CVT-NEXT:    cmp w2, #255
 ; CHECK-CVT-NEXT:    mov v4.s[1], w9
-; CHECK-CVT-NEXT:    csel w9, w11, w8, lo
-; CHECK-CVT-NEXT:    mov v1.s[1], w18
-; CHECK-CVT-NEXT:    fcvtzu w11, s0
-; CHECK-CVT-NEXT:    mov v3.s[2], w15
-; CHECK-CVT-NEXT:    mov v4.s[2], w0
-; CHECK-CVT-NEXT:    mov v1.s[2], w9
-; CHECK-CVT-NEXT:    cmp w11, #255
-; CHECK-CVT-NEXT:    csel w8, w11, w8, lo
-; CHECK-CVT-NEXT:    mov v2.s[3], w13
-; CHECK-CVT-NEXT:    mov v3.s[3], w17
-; CHECK-CVT-NEXT:    mov v4.s[3], w10
-; CHECK-CVT-NEXT:    mov v1.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v3.8h, v2.8h
-; CHECK-CVT-NEXT:    uzp1 v1.8h, v1.8h, v4.8h
+; CHECK-CVT-NEXT:    fcvtzu w9, s0
+; CHECK-CVT-NEXT:    csel w11, w2, w8, lo
+; CHECK-CVT-NEXT:    mov v3.s[2], w0
+; CHECK-CVT-NEXT:    mov v2.s[3], w12
+; CHECK-CVT-NEXT:    mov v1.s[2], w15
+; CHECK-CVT-NEXT:    mov v4.s[2], w11
+; CHECK-CVT-NEXT:    cmp w9, #255
+; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
+; CHECK-CVT-NEXT:    mov v3.s[3], w10
+; CHECK-CVT-NEXT:    mov v1.s[3], w16
+; CHECK-CVT-NEXT:    mov v4.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
+; CHECK-CVT-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
 ; CHECK-CVT-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
 ; CHECK-CVT-NEXT:    ret
 ;
@@ -2660,87 +2661,87 @@ define <16 x i16> @test_unsigned_v16f16_v16i16(<16 x half> %f) {
 ; CHECK-CVT:       // %bb.0:
 ; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v0.8h
 ; CHECK-CVT-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT:    fcvtl2 v5.4s, v1.8h
 ; CHECK-CVT-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-CVT-NEXT:    mov s3, v2.s[1]
 ; CHECK-CVT-NEXT:    mov s4, v2.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w9, s2
-; CHECK-CVT-NEXT:    mov s2, v2.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w12, s0
-; CHECK-CVT-NEXT:    fcvtzu w16, s5
-; CHECK-CVT-NEXT:    fcvtzu w2, s1
-; CHECK-CVT-NEXT:    fcvtzu w10, s3
+; CHECK-CVT-NEXT:    mov s5, v2.s[3]
+; CHECK-CVT-NEXT:    fcvtzu w10, s2
+; CHECK-CVT-NEXT:    fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-NEXT:    fcvtzu w13, s0
+; CHECK-CVT-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-CVT-NEXT:    fcvtzu w9, s3
 ; CHECK-CVT-NEXT:    mov s3, v0.s[1]
 ; CHECK-CVT-NEXT:    fcvtzu w11, s4
 ; CHECK-CVT-NEXT:    mov s4, v0.s[2]
+; CHECK-CVT-NEXT:    fcvtzu w12, s5
 ; CHECK-CVT-NEXT:    mov s0, v0.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w13, s2
-; CHECK-CVT-NEXT:    cmp w10, w8
-; CHECK-CVT-NEXT:    mov s2, v5.s[1]
+; CHECK-CVT-NEXT:    fcvtzu w18, s2
+; CHECK-CVT-NEXT:    fcvtzu w3, s1
 ; CHECK-CVT-NEXT:    fcvtzu w14, s3
-; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
 ; CHECK-CVT-NEXT:    cmp w9, w8
-; CHECK-CVT-NEXT:    fcvtzu w15, s4
+; CHECK-CVT-NEXT:    mov s3, v2.s[1]
 ; CHECK-CVT-NEXT:    csel w9, w9, w8, lo
+; CHECK-CVT-NEXT:    cmp w10, w8
+; CHECK-CVT-NEXT:    fcvtzu w15, s4
+; CHECK-CVT-NEXT:    csel w10, w10, w8, lo
 ; CHECK-CVT-NEXT:    cmp w11, w8
+; CHECK-CVT-NEXT:    mov s4, v2.s[2]
 ; CHECK-CVT-NEXT:    csel w11, w11, w8, lo
-; CHECK-CVT-NEXT:    cmp w13, w8
-; CHECK-CVT-NEXT:    mov s3, v5.s[2]
-; CHECK-CVT-NEXT:    fcvtzu w17, s0
-; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
-; CHECK-CVT-NEXT:    cmp w14, w8
-; CHECK-CVT-NEXT:    mov s4, v5.s[3]
-; CHECK-CVT-NEXT:    fcvtzu w18, s2
-; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
 ; CHECK-CVT-NEXT:    cmp w12, w8
-; CHECK-CVT-NEXT:    mov s0, v1.s[1]
+; CHECK-CVT-NEXT:    fcvtzu w16, s0
+; CHECK-CVT-NEXT:    mov s0, v2.s[3]
 ; CHECK-CVT-NEXT:    csel w12, w12, w8, lo
+; CHECK-CVT-NEXT:    cmp w14, w8
+; CHECK-CVT-NEXT:    fcvtzu w17, s3
+; CHECK-CVT-NEXT:    mov s3, v1.s[1]
+; CHECK-CVT-NEXT:    csel w14, w14, w8, lo
+; CHECK-CVT-NEXT:    cmp w13, w8
+; CHECK-CVT-NEXT:    fcvtzu w0, s4
+; CHECK-CVT-NEXT:    fmov s2, w10
+; CHECK-CVT-NEXT:    csel w13, w13, w8, lo
 ; CHECK-CVT-NEXT:    cmp w15, w8
-; CHECK-CVT-NEXT:    fcvtzu w0, s3
 ; CHECK-CVT-NEXT:    csel w15, w15, w8, lo
-; CHECK-CVT-NEXT:    cmp w17, w8
-; CHECK-CVT-NEXT:    csel w17, w17, w8, lo
-; CHECK-CVT-NEXT:    cmp w18, w8
-; CHECK-CVT-NEXT:    fmov s2, w9
-; CHECK-CVT-NEXT:    csel w9, w18, w8, lo
-; CHECK-CVT-NEXT:    fcvtzu w18, s4
 ; CHECK-CVT-NEXT:    cmp w16, w8
 ; CHECK-CVT-NEXT:    fcvtzu w1, s0
 ; CHECK-CVT-NEXT:    csel w16, w16, w8, lo
-; CHECK-CVT-NEXT:    cmp w0, w8
+; CHECK-CVT-NEXT:    cmp w17, w8
+; CHECK-CVT-NEXT:    fcvtzu w2, s3
+; CHECK-CVT-NEXT:    csel w17, w17, w8, lo
+; CHECK-CVT-NEXT:    cmp w18, w8
 ; CHECK-CVT-NEXT:    mov s0, v1.s[2]
+; CHECK-CVT-NEXT:    csel w18, w18, w8, lo
+; CHECK-CVT-NEXT:    cmp w0, w8
+; CHECK-CVT-NEXT:    mov v2.s[1], w9
 ; CHECK-CVT-NEXT:    csel w0, w0, w8, lo
-; CHECK-CVT-NEXT:    cmp w18, w8
-; CHECK-CVT-NEXT:    mov v2.s[1], w10
-; CHECK-CVT-NEXT:    csel w10, w18, w8, lo
 ; CHECK-CVT-NEXT:    cmp w1, w8
-; CHECK-CVT-NEXT:    fmov s3, w12
-; CHECK-CVT-NEXT:    csel w18, w1, w8, lo
+; CHECK-CVT-NEXT:    fmov s3, w18
+; CHECK-CVT-NEXT:    csel w10, w1, w8, lo
 ; CHECK-CVT-NEXT:    cmp w2, w8
-; CHECK-CVT-NEXT:    csel w1, w2, w8, lo
-; CHECK-CVT-NEXT:    fmov s4, w16
-; CHECK-CVT-NEXT:    mov v2.s[2], w11
-; CHECK-CVT-NEXT:    fcvtzu w11, s0
+; CHECK-CVT-NEXT:    csel w9, w2, w8, lo
+; CHECK-CVT-NEXT:    cmp w3, w8
+; CHECK-CVT-NEXT:    fcvtzu w2, s0
+; CHECK-CVT-NEXT:    csel w1, w3, w8, lo
 ; CHECK-CVT-NEXT:    mov s0, v1.s[3]
-; CHECK-CVT-NEXT:    fmov s5, w1
-; CHECK-CVT-NEXT:    mov v3.s[1], w14
-; CHECK-CVT-NEXT:    cmp w11, w8
+; CHECK-CVT-NEXT:    fmov s1, w13
+; CHECK-CVT-NEXT:    fmov s4, w1
+; CHECK-CVT-NEXT:    mov v3.s[1], w17
+; CHECK-CVT-NEXT:    mov v2.s[2], w11
+; CHECK-CVT-NEXT:    mov v1.s[1], w14
+; CHECK-CVT-NEXT:    cmp w2, w8
 ; CHECK-CVT-NEXT:    mov v4.s[1], w9
-; CHECK-CVT-NEXT:    csel w9, w11, w8, lo
-; CHECK-CVT-NEXT:    mov v5.s[1], w18
-; CHECK-CVT-NEXT:    fcvtzu w11, s0
-; CHECK-CVT-NEXT:    mov v3.s[2], w15
-; CHECK-CVT-NEXT:    mov v4.s[2], w0
-; CHECK-CVT-NEXT:    mov v5.s[2], w9
-; CHECK-CVT-NEXT:    cmp w11, w8
-; CHECK-CVT-NEXT:    csel w8, w11, w8, lo
-; CHECK-CVT-NEXT:    mov v2.s[3], w13
-; CHECK-CVT-NEXT:    mov v3.s[3], w17
-; CHECK-CVT-NEXT:    mov v4.s[3], w10
-; CHECK-CVT-NEXT:    mov v5.s[3], w8
-; CHECK-CVT-NEXT:    uzp1 v0.8h, v3.8h, v2.8h
-; CHECK-CVT-NEXT:    uzp1 v1.8h, v5.8h, v4.8h
+; CHECK-CVT-NEXT:    fcvtzu w9, s0
+; CHECK-CVT-NEXT:    csel w11, w2, w8, lo
+; CHECK-CVT-NEXT:    mov v3.s[2], w0
+; CHECK-CVT-NEXT:    mov v2.s[3], w12
+; CHECK-CVT-NEXT:    mov v1.s[2], w15
+; CHECK-CVT-NEXT:    mov v4.s[2], w11
+; CHECK-CVT-NEXT:    cmp w9, w8
+; CHECK-CVT-NEXT:    csel w8, w9, w8, lo
+; CHECK-CVT-NEXT:    mov v3.s[3], w10
+; CHECK-CVT-NEXT:    mov v1.s[3], w16
+; CHECK-CVT-NEXT:    mov v4.s[3], w8
+; CHECK-CVT-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
+; CHECK-CVT-NEXT:    uzp1 v1.8h, v4.8h, v3.8h
 ; CHECK-CVT-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_unsigned_v16f16_v16i16:
@@ -2756,45 +2757,45 @@ define <8 x i8> @test_unsigned_v8f64_v8i8(<8 x double> %f) {
 ; CHECK-LABEL: test_unsigned_v8f64_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d4, v3.d[1]
-; CHECK-NEXT:    fcvtzu w10, d3
-; CHECK-NEXT:    mov d3, v2.d[1]
-; CHECK-NEXT:    mov w8, #255 // =0xff
+; CHECK-NEXT:    mov d5, v2.d[1]
+; CHECK-NEXT:    mov w11, #255 // =0xff
+; CHECK-NEXT:    fcvtzu w9, d3
+; CHECK-NEXT:    mov d3, v1.d[1]
 ; CHECK-NEXT:    fcvtzu w12, d2
-; CHECK-NEXT:    fcvtzu w13, d1
-; CHECK-NEXT:    fcvtzu w9, d4
-; CHECK-NEXT:    mov d4, v1.d[1]
-; CHECK-NEXT:    fcvtzu w11, d3
-; CHECK-NEXT:    mov d1, v0.d[1]
+; CHECK-NEXT:    fcvtzu w14, d1
+; CHECK-NEXT:    fcvtzu w8, d4
+; CHECK-NEXT:    mov d4, v0.d[1]
+; CHECK-NEXT:    fcvtzu w10, d5
+; CHECK-NEXT:    fcvtzu w13, d3
+; CHECK-NEXT:    cmp w8, #255
+; CHECK-NEXT:    fcvtzu w15, d4
+; CHECK-NEXT:    csel w8, w8, w11, lo
 ; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    csel w9, w9, w8, lo
+; CHECK-NEXT:    csel w9, w9, w11, lo
 ; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    cmp w11, #255
-; CHECK-NEXT:    csel w11, w11, w8, lo
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    csel w9, w10, w11, lo
 ; CHECK-NEXT:    cmp w12, #255
-; CHECK-NEXT:    csel w12, w12, w8, lo
-; CHECK-NEXT:    fmov s19, w10
-; CHECK-NEXT:    fcvtzu w10, d4
-; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    mov v19.s[1], w9
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    cmp w13, #255
-; CHECK-NEXT:    fmov s18, w12
-; CHECK-NEXT:    fcvtzu w9, d1
-; CHECK-NEXT:    csel w12, w13, w8, lo
-; CHECK-NEXT:    fcvtzu w13, d0
-; CHECK-NEXT:    mov v18.s[1], w11
-; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    fmov s17, w12
-; CHECK-NEXT:    csel w9, w9, w8, lo
+; CHECK-NEXT:    fcvtzu w10, d0
+; CHECK-NEXT:    mov v4.s[1], w8
+; CHECK-NEXT:    csel w8, w12, w11, lo
 ; CHECK-NEXT:    cmp w13, #255
-; CHECK-NEXT:    csel w8, w13, w8, lo
-; CHECK-NEXT:    mov v17.s[1], w10
-; CHECK-NEXT:    fmov s16, w8
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    csel w8, w13, w11, lo
+; CHECK-NEXT:    cmp w14, #255
+; CHECK-NEXT:    mov v3.s[1], w9
+; CHECK-NEXT:    csel w9, w14, w11, lo
+; CHECK-NEXT:    cmp w15, #255
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    csel w9, w15, w11, lo
+; CHECK-NEXT:    cmp w10, #255
+; CHECK-NEXT:    mov v2.s[1], w8
+; CHECK-NEXT:    csel w8, w10, w11, lo
+; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    adrp x8, .LCPI82_0
-; CHECK-NEXT:    mov v16.s[1], w9
 ; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI82_0]
-; CHECK-NEXT:    tbl v0.8b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.8b
+; CHECK-NEXT:    mov v1.s[1], w9
+; CHECK-NEXT:    tbl v0.8b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.8b
 ; CHECK-NEXT:    ret
     %x = call <8 x i8> @llvm.fptoui.sat.v8f64.v8i8(<8 x double> %f)
     ret <8 x i8> %x
@@ -2805,99 +2806,99 @@ define <16 x i8> @test_unsigned_v16f64_v16i8(<16 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d16, v0.d[1]
 ; CHECK-NEXT:    fcvtzu w10, d0
-; CHECK-NEXT:    mov d0, v1.d[1]
 ; CHECK-NEXT:    mov w8, #255 // =0xff
-; CHECK-NEXT:    fcvtzu w12, d1
-; CHECK-NEXT:    mov d1, v2.d[1]
 ; CHECK-NEXT:    fcvtzu w9, d16
-; CHECK-NEXT:    fcvtzu w11, d0
+; CHECK-NEXT:    mov d16, v1.d[1]
 ; CHECK-NEXT:    cmp w9, #255
 ; CHECK-NEXT:    csel w9, w9, w8, lo
 ; CHECK-NEXT:    cmp w10, #255
 ; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    cmp w11, #255
 ; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    csel w10, w11, w8, lo
-; CHECK-NEXT:    cmp w12, #255
-; CHECK-NEXT:    csel w11, w12, w8, lo
-; CHECK-NEXT:    fcvtzu w12, d2
+; CHECK-NEXT:    fcvtzu w10, d16
+; CHECK-NEXT:    mov d16, v2.d[1]
 ; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    fcvtzu w9, d1
-; CHECK-NEXT:    mov d2, v3.d[1]
-; CHECK-NEXT:    fmov s1, w11
+; CHECK-NEXT:    cmp w10, #255
+; CHECK-NEXT:    csel w10, w10, w8, lo
 ; CHECK-NEXT:    cmp w9, #255
 ; CHECK-NEXT:    mov w11, v0.s[1]
 ; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w12, #255
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fcvtzu w9, d16
+; CHECK-NEXT:    mov d16, v3.d[1]
+; CHECK-NEXT:    mov v0.b[1], w11
 ; CHECK-NEXT:    mov v1.s[1], w10
-; CHECK-NEXT:    csel w12, w12, w8, lo
 ; CHECK-NEXT:    fcvtzu w10, d2
-; CHECK-NEXT:    mov v0.b[1], w11
-; CHECK-NEXT:    fcvtzu w11, d3
-; CHECK-NEXT:    fmov s2, w12
-; CHECK-NEXT:    mov w12, v1.s[1]
+; CHECK-NEXT:    cmp w9, #255
+; CHECK-NEXT:    csel w9, w9, w8, lo
 ; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    mov d3, v4.d[1]
-; CHECK-NEXT:    csel w10, w10, w8, lo
+; CHECK-NEXT:    mov w11, v1.s[1]
 ; CHECK-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-NEXT:    cmp w11, #255
+; CHECK-NEXT:    csel w10, w10, w8, lo
+; CHECK-NEXT:    fmov s2, w10
+; CHECK-NEXT:    fcvtzu w10, d16
+; CHECK-NEXT:    mov d16, v4.d[1]
+; CHECK-NEXT:    mov v0.b[3], w11
 ; CHECK-NEXT:    mov v2.s[1], w9
-; CHECK-NEXT:    csel w11, w11, w8, lo
 ; CHECK-NEXT:    fcvtzu w9, d3
-; CHECK-NEXT:    mov d3, v5.d[1]
-; CHECK-NEXT:    mov v0.b[3], w12
-; CHECK-NEXT:    fcvtzu w12, d4
-; CHECK-NEXT:    fmov s4, w11
-; CHECK-NEXT:    mov w11, v2.s[1]
+; CHECK-NEXT:    cmp w10, #255
+; CHECK-NEXT:    csel w10, w10, w8, lo
 ; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w12, #255
+; CHECK-NEXT:    mov w11, v2.s[1]
 ; CHECK-NEXT:    mov v0.b[4], v2.b[0]
-; CHECK-NEXT:    csel w12, w12, w8, lo
-; CHECK-NEXT:    mov v4.s[1], w10
-; CHECK-NEXT:    fcvtzu w10, d3
-; CHECK-NEXT:    fmov s3, w12
+; CHECK-NEXT:    csel w9, w9, w8, lo
+; CHECK-NEXT:    fmov s3, w9
+; CHECK-NEXT:    fcvtzu w9, d16
 ; CHECK-NEXT:    mov v0.b[5], w11
-; CHECK-NEXT:    fcvtzu w11, d5
-; CHECK-NEXT:    mov w12, v4.s[1]
+; CHECK-NEXT:    mov v3.s[1], w10
+; CHECK-NEXT:    fcvtzu w10, d4
+; CHECK-NEXT:    mov d4, v5.d[1]
+; CHECK-NEXT:    cmp w9, #255
+; CHECK-NEXT:    csel w9, w9, w8, lo
 ; CHECK-NEXT:    cmp w10, #255
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    mov d5, v6.d[1]
-; CHECK-NEXT:    cmp w11, #255
-; CHECK-NEXT:    mov v0.b[6], v4.b[0]
-; CHECK-NEXT:    csel w11, w11, w8, lo
-; CHECK-NEXT:    mov v3.s[1], w9
-; CHECK-NEXT:    fcvtzu w9, d6
-; CHECK-NEXT:    mov d6, v7.d[1]
-; CHECK-NEXT:    mov v0.b[7], w12
-; CHECK-NEXT:    fcvtzu w12, d5
-; CHECK-NEXT:    fmov s5, w11
 ; CHECK-NEXT:    mov w11, v3.s[1]
-; CHECK-NEXT:    cmp w12, #255
-; CHECK-NEXT:    mov v0.b[8], v3.b[0]
-; CHECK-NEXT:    csel w12, w12, w8, lo
+; CHECK-NEXT:    mov v0.b[6], v3.b[0]
+; CHECK-NEXT:    csel w10, w10, w8, lo
+; CHECK-NEXT:    fmov s16, w10
+; CHECK-NEXT:    fcvtzu w10, d4
+; CHECK-NEXT:    mov d4, v6.d[1]
+; CHECK-NEXT:    mov v0.b[7], w11
+; CHECK-NEXT:    mov v16.s[1], w9
+; CHECK-NEXT:    fcvtzu w9, d5
+; CHECK-NEXT:    cmp w10, #255
+; CHECK-NEXT:    csel w10, w10, w8, lo
 ; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    mov v5.s[1], w10
+; CHECK-NEXT:    mov w11, v16.s[1]
+; CHECK-NEXT:    mov v0.b[8], v16.b[0]
 ; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    fcvtzu w10, d6
+; CHECK-NEXT:    fmov s5, w9
+; CHECK-NEXT:    fcvtzu w9, d4
+; CHECK-NEXT:    mov d4, v7.d[1]
 ; CHECK-NEXT:    mov v0.b[9], w11
-; CHECK-NEXT:    fcvtzu w11, d7
-; CHECK-NEXT:    fmov s16, w9
-; CHECK-NEXT:    mov w9, v5.s[1]
+; CHECK-NEXT:    mov v5.s[1], w10
+; CHECK-NEXT:    fcvtzu w10, d6
+; CHECK-NEXT:    cmp w9, #255
+; CHECK-NEXT:    csel w9, w9, w8, lo
 ; CHECK-NEXT:    cmp w10, #255
 ; CHECK-NEXT:    mov v0.b[10], v5.b[0]
-; CHECK-NEXT:    mov v16.s[1], w12
-; CHECK-NEXT:    mov v0.b[11], w9
-; CHECK-NEXT:    csel w9, w10, w8, lo
-; CHECK-NEXT:    cmp w11, #255
-; CHECK-NEXT:    mov w10, v16.s[1]
-; CHECK-NEXT:    csel w8, w11, w8, lo
-; CHECK-NEXT:    mov v0.b[12], v16.b[0]
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    mov v0.b[13], w10
+; CHECK-NEXT:    mov w11, v5.s[1]
+; CHECK-NEXT:    csel w10, w10, w8, lo
+; CHECK-NEXT:    fmov s6, w10
+; CHECK-NEXT:    fcvtzu w10, d7
+; CHECK-NEXT:    mov v0.b[11], w11
 ; CHECK-NEXT:    mov v6.s[1], w9
-; CHECK-NEXT:    mov v0.b[14], v6.b[0]
-; CHECK-NEXT:    mov w8, v6.s[1]
+; CHECK-NEXT:    fcvtzu w9, d4
+; CHECK-NEXT:    cmp w9, #255
+; CHECK-NEXT:    mov v0.b[12], v6.b[0]
+; CHECK-NEXT:    mov w11, v6.s[1]
+; CHECK-NEXT:    csel w9, w9, w8, lo
+; CHECK-NEXT:    cmp w10, #255
+; CHECK-NEXT:    csel w8, w10, w8, lo
+; CHECK-NEXT:    fmov s4, w8
+; CHECK-NEXT:    mov v0.b[13], w11
+; CHECK-NEXT:    mov v4.s[1], w9
+; CHECK-NEXT:    mov v0.b[14], v4.b[0]
+; CHECK-NEXT:    mov w8, v4.s[1]
 ; CHECK-NEXT:    mov v0.b[15], w8
 ; CHECK-NEXT:    ret
     %x = call <16 x i8> @llvm.fptoui.sat.v16f64.v16i8(<16 x double> %f)
@@ -2908,45 +2909,45 @@ define <8 x i16> @test_unsigned_v8f64_v8i16(<8 x double> %f) {
 ; CHECK-LABEL: test_unsigned_v8f64_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d4, v3.d[1]
-; CHECK-NEXT:    fcvtzu w10, d3
-; CHECK-NEXT:    mov d3, v2.d[1]
-; CHECK-NEXT:    mov w8, #65535 // =0xffff
+; CHECK-NEXT:    mov d5, v2.d[1]
+; CHECK-NEXT:    mov w10, #65535 // =0xffff
+; CHECK-NEXT:    fcvtzu w9, d3
+; CHECK-NEXT:    mov d3, v1.d[1]
 ; CHECK-NEXT:    fcvtzu w12, d2
-; CHECK-NEXT:    fcvtzu w13, d1
-; CHECK-NEXT:    fcvtzu w9, d4
-; CHECK-NEXT:    mov d4, v1.d[1]
-; CHECK-NEXT:    fcvtzu w11, d3
-; CHECK-NEXT:    mov d1, v0.d[1]
-; CHECK-NEXT:    cmp w9, w8
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w10, w8
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    cmp w11, w8
-; CHECK-NEXT:    csel w11, w11, w8, lo
-; CHECK-NEXT:    cmp w12, w8
-; CHECK-NEXT:    csel w12, w12, w8, lo
-; CHECK-NEXT:    fmov s19, w10
-; CHECK-NEXT:    fcvtzu w10, d4
-; CHECK-NEXT:    cmp w10, w8
-; CHECK-NEXT:    mov v19.s[1], w9
-; CHECK-NEXT:    csel w10, w10, w8, lo
-; CHECK-NEXT:    cmp w13, w8
-; CHECK-NEXT:    fmov s18, w12
-; CHECK-NEXT:    fcvtzu w9, d1
-; CHECK-NEXT:    csel w12, w13, w8, lo
-; CHECK-NEXT:    fcvtzu w13, d0
-; CHECK-NEXT:    mov v18.s[1], w11
-; CHECK-NEXT:    cmp w9, w8
-; CHECK-NEXT:    fmov s17, w12
-; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w13, w8
-; CHECK-NEXT:    csel w8, w13, w8, lo
-; CHECK-NEXT:    mov v17.s[1], w10
-; CHECK-NEXT:    fmov s16, w8
+; CHECK-NEXT:    fcvtzu w14, d1
+; CHECK-NEXT:    fcvtzu w8, d4
+; CHECK-NEXT:    mov d4, v0.d[1]
+; CHECK-NEXT:    fcvtzu w11, d5
+; CHECK-NEXT:    fcvtzu w13, d3
+; CHECK-NEXT:    cmp w8, w10
+; CHECK-NEXT:    fcvtzu w15, d4
+; CHECK-NEXT:    csel w8, w8, w10, lo
+; CHECK-NEXT:    cmp w9, w10
+; CHECK-NEXT:    csel w9, w9, w10, lo
+; CHECK-NEXT:    cmp w11, w10
+; CHECK-NEXT:    fmov s4, w9
+; CHECK-NEXT:    csel w9, w11, w10, lo
+; CHECK-NEXT:    cmp w12, w10
+; CHECK-NEXT:    fcvtzu w11, d0
+; CHECK-NEXT:    mov v4.s[1], w8
+; CHECK-NEXT:    csel w8, w12, w10, lo
+; CHECK-NEXT:    cmp w13, w10
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    csel w8, w13, w10, lo
+; CHECK-NEXT:    cmp w14, w10
+; CHECK-NEXT:    mov v3.s[1], w9
+; CHECK-NEXT:    csel w9, w14, w10, lo
+; CHECK-NEXT:    cmp w15, w10
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    csel w9, w15, w10, lo
+; CHECK-NEXT:    cmp w11, w10
+; CHECK-NEXT:    mov v2.s[1], w8
+; CHECK-NEXT:    csel w8, w11, w10, lo
+; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    adrp x8, .LCPI84_0
-; CHECK-NEXT:    mov v16.s[1], w9
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI84_0]
-; CHECK-NEXT:    tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
+; CHECK-NEXT:    mov v1.s[1], w9
+; CHECK-NEXT:    tbl v0.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
 ; CHECK-NEXT:    ret
     %x = call <8 x i16> @llvm.fptoui.sat.v8f64.v8i16(<8 x double> %f)
     ret <8 x i16> %x
@@ -2957,78 +2958,78 @@ define <16 x i16> @test_unsigned_v16f64_v16i16(<16 x double> %f) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov d16, v3.d[1]
 ; CHECK-NEXT:    mov d17, v2.d[1]
+; CHECK-NEXT:    mov w8, #65535 // =0xffff
 ; CHECK-NEXT:    fcvtzu w9, d3
 ; CHECK-NEXT:    mov d3, v1.d[1]
-; CHECK-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-NEXT:    fcvtzu w10, d2
-; CHECK-NEXT:    fcvtzu w12, d1
+; CHECK-NEXT:    fcvtzu w11, d1
 ; CHECK-NEXT:    mov d1, v0.d[1]
-; CHECK-NEXT:    fcvtzu w11, d16
-; CHECK-NEXT:    fcvtzu w13, d17
-; CHECK-NEXT:    fcvtzu w14, d3
-; CHECK-NEXT:    mov d2, v7.d[1]
-; CHECK-NEXT:    fcvtzu w17, d6
-; CHECK-NEXT:    cmp w11, w8
-; CHECK-NEXT:    fcvtzu w15, d1
-; CHECK-NEXT:    csel w11, w11, w8, lo
+; CHECK-NEXT:    fcvtzu w10, d2
+; CHECK-NEXT:    fcvtzu w13, d0
+; CHECK-NEXT:    mov d0, v7.d[1]
+; CHECK-NEXT:    mov d2, v6.d[1]
+; CHECK-NEXT:    fcvtzu w15, d7
+; CHECK-NEXT:    fcvtzu w12, d16
+; CHECK-NEXT:    fcvtzu w14, d17
+; CHECK-NEXT:    fcvtzu w16, d6
+; CHECK-NEXT:    fcvtzu w17, d3
+; CHECK-NEXT:    mov d6, v5.d[1]
+; CHECK-NEXT:    mov d3, v4.d[1]
+; CHECK-NEXT:    fcvtzu w18, d1
+; CHECK-NEXT:    cmp w12, w8
+; CHECK-NEXT:    csel w12, w12, w8, lo
 ; CHECK-NEXT:    cmp w9, w8
 ; CHECK-NEXT:    csel w9, w9, w8, lo
-; CHECK-NEXT:    cmp w13, w8
-; CHECK-NEXT:    csel w13, w13, w8, lo
-; CHECK-NEXT:    cmp w10, w8
-; CHECK-NEXT:    csel w10, w10, w8, lo
 ; CHECK-NEXT:    cmp w14, w8
 ; CHECK-NEXT:    fmov s19, w9
 ; CHECK-NEXT:    csel w9, w14, w8, lo
+; CHECK-NEXT:    cmp w10, w8
 ; CHECK-NEXT:    fcvtzu w14, d0
-; CHECK-NEXT:    cmp w12, w8
-; CHECK-NEXT:    fcvtzu w16, d2
-; CHECK-NEXT:    mov d0, v6.d[1]
-; CHECK-NEXT:    csel w12, w12, w8, lo
-; CHECK-NEXT:    cmp w15, w8
-; CHECK-NEXT:    mov v19.s[1], w11
-; CHECK-NEXT:    fcvtzu w11, d7
-; CHECK-NEXT:    fmov s18, w10
-; CHECK-NEXT:    csel w10, w15, w8, lo
-; CHECK-NEXT:    cmp w14, w8
-; CHECK-NEXT:    csel w14, w14, w8, lo
-; CHECK-NEXT:    cmp w16, w8
-; CHECK-NEXT:    csel w15, w16, w8, lo
+; CHECK-NEXT:    csel w10, w10, w8, lo
+; CHECK-NEXT:    cmp w17, w8
+; CHECK-NEXT:    mov v19.s[1], w12
+; CHECK-NEXT:    csel w12, w17, w8, lo
 ; CHECK-NEXT:    cmp w11, w8
-; CHECK-NEXT:    fcvtzu w16, d0
-; CHECK-NEXT:    mov d0, v5.d[1]
 ; CHECK-NEXT:    csel w11, w11, w8, lo
-; CHECK-NEXT:    mov v18.s[1], w13
-; CHECK-NEXT:    cmp w16, w8
-; CHECK-NEXT:    fmov s17, w12
+; CHECK-NEXT:    cmp w18, w8
+; CHECK-NEXT:    fmov s18, w10
+; CHECK-NEXT:    csel w10, w18, w8, lo
+; CHECK-NEXT:    cmp w13, w8
+; CHECK-NEXT:    fcvtzu w17, d2
+; CHECK-NEXT:    csel w13, w13, w8, lo
+; CHECK-NEXT:    cmp w14, w8
+; CHECK-NEXT:    fcvtzu w18, d6
+; CHECK-NEXT:    mov v18.s[1], w9
+; CHECK-NEXT:    csel w9, w14, w8, lo
+; CHECK-NEXT:    cmp w15, w8
+; CHECK-NEXT:    fmov s17, w11
+; CHECK-NEXT:    csel w11, w15, w8, lo
+; CHECK-NEXT:    fcvtzu w14, d5
 ; CHECK-NEXT:    fmov s23, w11
-; CHECK-NEXT:    csel w11, w16, w8, lo
 ; CHECK-NEXT:    cmp w17, w8
-; CHECK-NEXT:    fcvtzu w16, d0
-; CHECK-NEXT:    mov d0, v4.d[1]
-; CHECK-NEXT:    csel w13, w17, w8, lo
-; CHECK-NEXT:    fcvtzu w17, d5
-; CHECK-NEXT:    fcvtzu w12, d4
-; CHECK-NEXT:    mov v23.s[1], w15
+; CHECK-NEXT:    fcvtzu w15, d3
+; CHECK-NEXT:    csel w11, w17, w8, lo
 ; CHECK-NEXT:    cmp w16, w8
-; CHECK-NEXT:    fmov s22, w13
-; CHECK-NEXT:    csel w13, w16, w8, lo
-; CHECK-NEXT:    cmp w17, w8
-; CHECK-NEXT:    fcvtzu w16, d0
-; CHECK-NEXT:    csel w15, w17, w8, lo
+; CHECK-NEXT:    fcvtzu w17, d4
+; CHECK-NEXT:    mov v17.s[1], w12
+; CHECK-NEXT:    mov v23.s[1], w9
+; CHECK-NEXT:    csel w9, w16, w8, lo
+; CHECK-NEXT:    cmp w18, w8
+; CHECK-NEXT:    fmov s22, w9
+; CHECK-NEXT:    csel w9, w18, w8, lo
+; CHECK-NEXT:    cmp w14, w8
+; CHECK-NEXT:    fmov s16, w13
 ; CHECK-NEXT:    mov v22.s[1], w11
-; CHECK-NEXT:    cmp w16, w8
-; CHECK-NEXT:    fmov s21, w15
-; CHECK-NEXT:    csel w11, w16, w8, lo
-; CHECK-NEXT:    cmp w12, w8
-; CHECK-NEXT:    csel w8, w12, w8, lo
-; CHECK-NEXT:    mov v17.s[1], w9
-; CHECK-NEXT:    adrp x9, .LCPI85_0
-; CHECK-NEXT:    mov v21.s[1], w13
-; CHECK-NEXT:    fmov s16, w14
-; CHECK-NEXT:    fmov s20, w8
-; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI85_0]
+; CHECK-NEXT:    csel w11, w14, w8, lo
+; CHECK-NEXT:    cmp w15, w8
+; CHECK-NEXT:    fmov s21, w11
+; CHECK-NEXT:    csel w11, w15, w8, lo
+; CHECK-NEXT:    cmp w17, w8
+; CHECK-NEXT:    csel w8, w17, w8, lo
 ; CHECK-NEXT:    mov v16.s[1], w10
+; CHECK-NEXT:    mov v21.s[1], w9
+; CHECK-NEXT:    fmov s20, w8
+; CHECK-NEXT:    adrp x8, .LCPI85_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI85_0]
 ; CHECK-NEXT:    mov v20.s[1], w11
 ; CHECK-NEXT:    tbl v0.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
 ; CHECK-NEXT:    tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b

diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index fed7c8634db45e..1751e353c5f2f6 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -57,12 +57,12 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    fcvt s2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    fcvt s1, d2
 ; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index fc864e55a7ba76..360ba4414a0072 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -231,13 +231,13 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[2], v3.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
@@ -418,7 +418,6 @@ define <16 x half> @sqrt_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
 ; CHECK-SD-NOFP16-NEXT:    fsqrt s18, s0
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt h18, s18
 ; CHECK-SD-NOFP16-NEXT:    fsqrt s19, s0
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[2]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
@@ -430,48 +429,49 @@ define <16 x half> @sqrt_v16f16(<16 x half> %a) {
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
 ; CHECK-SD-NOFP16-NEXT:    fsqrt s2, s2
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
-; CHECK-SD-NOFP16-NEXT:    fsqrt s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fsqrt s22, s0
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[5]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fsqrt s4, s4
 ; CHECK-SD-NOFP16-NEXT:    fsqrt s5, s5
-; CHECK-SD-NOFP16-NEXT:    fsqrt s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fsqrt s23, s0
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[6]
-; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
 ; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT:    fsqrt s6, s6
 ; CHECK-SD-NOFP16-NEXT:    fsqrt s7, s7
-; CHECK-SD-NOFP16-NEXT:    fsqrt s16, s16
 ; CHECK-SD-NOFP16-NEXT:    fsqrt s24, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h1
+; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[7]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h1, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt s25, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s4
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v18.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s20
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s20
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s5
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s21
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[2], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s21
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s6
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s22
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s22
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s7
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s23
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s23
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s24
+; CHECK-SD-NOFP16-NEXT:    fsqrt s16, s16
+; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s16
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s24
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[6], v2.h[0]
 ; CHECK-SD-NOFP16-NEXT:    fcvt h2, s17
-; CHECK-SD-NOFP16-NEXT:    mov v1.h[6], v4.h[0]
+; CHECK-SD-NOFP16-NEXT:    fsqrt s18, s25
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[7], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fsqrt s3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, s18
 ; CHECK-SD-NOFP16-NEXT:    mov v1.h[7], v3.h[0]
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;

diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
index 3e097c559032d1..181f2185893e43 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
@@ -40,12 +40,12 @@ define i16 @rotl_i16(i16 %x, i16 %z) {
 ; CHECK-LABEL: rotl_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    and w9, w1, #0xf
+; CHECK-NEXT:    and w9, w0, #0xffff
+; CHECK-NEXT:    and w10, w1, #0xf
 ; CHECK-NEXT:    and w8, w8, #0xf
-; CHECK-NEXT:    and w10, w0, #0xffff
-; CHECK-NEXT:    lsl w9, w0, w9
-; CHECK-NEXT:    lsr w8, w10, w8
-; CHECK-NEXT:    orr w0, w9, w8
+; CHECK-NEXT:    lsl w10, w0, w10
+; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    orr w0, w10, w8
 ; CHECK-NEXT:    ret
   %f = call i16 @llvm.fshl.i16(i16 %x, i16 %x, i16 %z)
   ret i16 %f
@@ -132,10 +132,10 @@ define i16 @rotr_i16(i16 %x, i16 %z) {
 ; CHECK-LABEL: rotr_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    and w9, w1, #0xf
+; CHECK-NEXT:    and w9, w0, #0xffff
+; CHECK-NEXT:    and w10, w1, #0xf
 ; CHECK-NEXT:    and w8, w8, #0xf
-; CHECK-NEXT:    and w10, w0, #0xffff
-; CHECK-NEXT:    lsr w9, w10, w9
+; CHECK-NEXT:    lsr w9, w9, w10
 ; CHECK-NEXT:    lsl w8, w0, w8
 ; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
@@ -169,8 +169,8 @@ define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) {
 ; CHECK-NEXT:    movi v2.4s, #31
 ; CHECK-NEXT:    neg v3.4s, v1.4s
 ; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    and v2.16b, v3.16b, v2.16b
+; CHECK-NEXT:    neg v1.4s, v1.4s
 ; CHECK-NEXT:    ushl v2.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b

diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index a7f3aed163910d..1a2b06e0afb9d3 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -19,11 +19,11 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshl_i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w8, w1, #1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT:    mvn w8, w2
-; CHECK-NEXT:    lsr w9, w1, #1
+; CHECK-NEXT:    mvn w9, w2
 ; CHECK-NEXT:    lsl w10, w0, w2
-; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    lsr w8, w8, w9
 ; CHECK-NEXT:    orr w0, w10, w8
 ; CHECK-NEXT:    ret
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -33,10 +33,10 @@ define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
 define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) {
 ; CHECK-LABEL: fshl_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w2
-; CHECK-NEXT:    lsr x9, x1, #1
+; CHECK-NEXT:    lsr x8, x1, #1
+; CHECK-NEXT:    mvn w9, w2
 ; CHECK-NEXT:    lsl x10, x0, x2
-; CHECK-NEXT:    lsr x8, x9, x8
+; CHECK-NEXT:    lsr x8, x8, x9
 ; CHECK-NEXT:    orr x0, x10, x8
 ; CHECK-NEXT:    ret
   %f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
@@ -47,18 +47,18 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; CHECK-LABEL: fshl_i128:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    tst x4, #0x40
-; CHECK-NEXT:    mvn w8, w4
+; CHECK-NEXT:    mvn w11, w4
+; CHECK-NEXT:    csel x8, x3, x0, ne
 ; CHECK-NEXT:    csel x9, x2, x3, ne
-; CHECK-NEXT:    csel x10, x3, x0, ne
-; CHECK-NEXT:    lsr x9, x9, #1
-; CHECK-NEXT:    lsl x11, x10, x4
 ; CHECK-NEXT:    csel x12, x0, x1, ne
-; CHECK-NEXT:    lsr x10, x10, #1
-; CHECK-NEXT:    lsr x9, x9, x8
+; CHECK-NEXT:    lsr x9, x9, #1
+; CHECK-NEXT:    lsr x10, x8, #1
+; CHECK-NEXT:    lsl x8, x8, x4
 ; CHECK-NEXT:    lsl x12, x12, x4
-; CHECK-NEXT:    lsr x8, x10, x8
-; CHECK-NEXT:    orr x0, x11, x9
-; CHECK-NEXT:    orr x1, x12, x8
+; CHECK-NEXT:    lsr x9, x9, x11
+; CHECK-NEXT:    lsr x10, x10, x11
+; CHECK-NEXT:    orr x0, x8, x9
+; CHECK-NEXT:    orr x1, x12, x10
 ; CHECK-NEXT:    ret
   %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
   ret i128 %f
@@ -69,18 +69,18 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
 define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
 ; CHECK-LABEL: fshl_i37:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #46053
+; CHECK-NEXT:    mov x9, #46053 // =0xb3e5
 ; CHECK-NEXT:    and x8, x2, #0x1fffffffff
 ; CHECK-NEXT:    movk x9, #12398, lsl #16
-; CHECK-NEXT:    ubfiz x10, x1, #26, #37
 ; CHECK-NEXT:    movk x9, #15941, lsl #32
 ; CHECK-NEXT:    movk x9, #1771, lsl #48
 ; CHECK-NEXT:    umulh x8, x8, x9
-; CHECK-NEXT:    mov w9, #37
+; CHECK-NEXT:    mov w9, #37 // =0x25
 ; CHECK-NEXT:    msub w8, w8, w9, w2
-; CHECK-NEXT:    mvn w9, w8
+; CHECK-NEXT:    ubfiz x9, x1, #26, #37
+; CHECK-NEXT:    mvn w10, w8
 ; CHECK-NEXT:    lsl x8, x0, x8
-; CHECK-NEXT:    lsr x9, x10, x9
+; CHECK-NEXT:    lsr x9, x9, x10
 ; CHECK-NEXT:    orr x0, x8, x9
 ; CHECK-NEXT:    ret
   %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
@@ -93,7 +93,7 @@ declare i7 @llvm.fshl.i7(i7, i7, i7)
 define i7 @fshl_i7_const_fold() {
 ; CHECK-LABEL: fshl_i7_const_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #67
+; CHECK-NEXT:    mov w0, #67 // =0x43
 ; CHECK-NEXT:    ret
   %f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2)
   ret i7 %f
@@ -102,7 +102,7 @@ define i7 @fshl_i7_const_fold() {
 define i8 @fshl_i8_const_fold_overshift_1() {
 ; CHECK-LABEL: fshl_i8_const_fold_overshift_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #128
+; CHECK-NEXT:    mov w0, #128 // =0x80
 ; CHECK-NEXT:    ret
   %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 15)
   ret i8 %f
@@ -111,7 +111,7 @@ define i8 @fshl_i8_const_fold_overshift_1() {
 define i8 @fshl_i8_const_fold_overshift_2() {
 ; CHECK-LABEL: fshl_i8_const_fold_overshift_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #120
+; CHECK-NEXT:    mov w0, #120 // =0x78
 ; CHECK-NEXT:    ret
   %f = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11)
   ret i8 %f
@@ -164,7 +164,7 @@ define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) {
 define i8 @fshl_i8_const_fold() {
 ; CHECK-LABEL: fshl_i8_const_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #128
+; CHECK-NEXT:    mov w0, #128 // =0x80
 ; CHECK-NEXT:    ret
   %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7)
   ret i8 %f
@@ -177,11 +177,11 @@ define i8 @fshl_i8_const_fold() {
 define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshr_i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl w8, w0, #1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT:    mvn w8, w2
-; CHECK-NEXT:    lsl w9, w0, #1
+; CHECK-NEXT:    mvn w9, w2
 ; CHECK-NEXT:    lsr w10, w1, w2
-; CHECK-NEXT:    lsl w8, w9, w8
+; CHECK-NEXT:    lsl w8, w8, w9
 ; CHECK-NEXT:    orr w0, w8, w10
 ; CHECK-NEXT:    ret
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
@@ -191,10 +191,10 @@ define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
 define i64 @fshr_i64(i64 %x, i64 %y, i64 %z) {
 ; CHECK-LABEL: fshr_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w2
-; CHECK-NEXT:    lsl x9, x0, #1
+; CHECK-NEXT:    lsl x8, x0, #1
+; CHECK-NEXT:    mvn w9, w2
 ; CHECK-NEXT:    lsr x10, x1, x2
-; CHECK-NEXT:    lsl x8, x9, x8
+; CHECK-NEXT:    lsl x8, x8, x9
 ; CHECK-NEXT:    orr x0, x8, x10
 ; CHECK-NEXT:    ret
   %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
@@ -206,20 +206,20 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
 define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
 ; CHECK-LABEL: fshr_i37:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #46053
+; CHECK-NEXT:    mov x9, #46053 // =0xb3e5
 ; CHECK-NEXT:    and x8, x2, #0x1fffffffff
+; CHECK-NEXT:    lsl x10, x0, #1
 ; CHECK-NEXT:    movk x9, #12398, lsl #16
-; CHECK-NEXT:    lsl x10, x1, #27
 ; CHECK-NEXT:    movk x9, #15941, lsl #32
-; CHECK-NEXT:    lsl x11, x0, #1
 ; CHECK-NEXT:    movk x9, #1771, lsl #48
 ; CHECK-NEXT:    umulh x8, x8, x9
-; CHECK-NEXT:    mov w9, #37
+; CHECK-NEXT:    mov w9, #37 // =0x25
 ; CHECK-NEXT:    msub w8, w8, w9, w2
+; CHECK-NEXT:    lsl x9, x1, #27
 ; CHECK-NEXT:    add w8, w8, #27
-; CHECK-NEXT:    mvn w9, w8
-; CHECK-NEXT:    lsr x8, x10, x8
-; CHECK-NEXT:    lsl x9, x11, x9
+; CHECK-NEXT:    mvn w11, w8
+; CHECK-NEXT:    lsr x8, x9, x8
+; CHECK-NEXT:    lsl x9, x10, x11
 ; CHECK-NEXT:    orr x0, x9, x8
 ; CHECK-NEXT:    ret
   %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
@@ -232,7 +232,7 @@ declare i7 @llvm.fshr.i7(i7, i7, i7)
 define i7 @fshr_i7_const_fold() {
 ; CHECK-LABEL: fshr_i7_const_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #31
+; CHECK-NEXT:    mov w0, #31 // =0x1f
 ; CHECK-NEXT:    ret
   %f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2)
   ret i7 %f
@@ -241,7 +241,7 @@ define i7 @fshr_i7_const_fold() {
 define i8 @fshr_i8_const_fold_overshift_1() {
 ; CHECK-LABEL: fshr_i8_const_fold_overshift_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #254
+; CHECK-NEXT:    mov w0, #254 // =0xfe
 ; CHECK-NEXT:    ret
   %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 15)
   ret i8 %f
@@ -250,7 +250,7 @@ define i8 @fshr_i8_const_fold_overshift_1() {
 define i8 @fshr_i8_const_fold_overshift_2() {
 ; CHECK-LABEL: fshr_i8_const_fold_overshift_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #225
+; CHECK-NEXT:    mov w0, #225 // =0xe1
 ; CHECK-NEXT:    ret
   %f = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)
   ret i8 %f
@@ -259,7 +259,7 @@ define i8 @fshr_i8_const_fold_overshift_2() {
 define i8 @fshr_i8_const_fold_overshift_3() {
 ; CHECK-LABEL: fshr_i8_const_fold_overshift_3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #255
+; CHECK-NEXT:    mov w0, #255 // =0xff
 ; CHECK-NEXT:    ret
   %f = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)
   ret i8 %f
@@ -303,7 +303,7 @@ define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
 define i8 @fshr_i8_const_fold() {
 ; CHECK-LABEL: fshr_i8_const_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #254
+; CHECK-NEXT:    mov w0, #254 // =0xfe
 ; CHECK-NEXT:    ret
   %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7)
   ret i8 %f
@@ -347,13 +347,13 @@ define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: or_shl_fshl:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w2
-; CHECK-NEXT:    mvn w9, w2
-; CHECK-NEXT:    lsr w10, w1, #1
-; CHECK-NEXT:    lsr w9, w10, w9
-; CHECK-NEXT:    lsl w8, w0, w8
+; CHECK-NEXT:    lsr w9, w1, #1
 ; CHECK-NEXT:    lsl w10, w1, w2
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    orr w0, w8, w10
+; CHECK-NEXT:    mvn w11, w2
+; CHECK-NEXT:    lsl w8, w0, w8
+; CHECK-NEXT:    lsr w9, w9, w11
+; CHECK-NEXT:    orr w8, w8, w10
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %shy = shl i32 %y, %s
   %fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s)
@@ -379,13 +379,13 @@ define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: or_shl_fshl_commute:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w2
-; CHECK-NEXT:    mvn w9, w2
-; CHECK-NEXT:    lsr w10, w1, #1
-; CHECK-NEXT:    lsr w9, w10, w9
-; CHECK-NEXT:    lsl w8, w0, w8
+; CHECK-NEXT:    lsr w9, w1, #1
 ; CHECK-NEXT:    lsl w10, w1, w2
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    orr w0, w10, w8
+; CHECK-NEXT:    mvn w11, w2
+; CHECK-NEXT:    lsl w8, w0, w8
+; CHECK-NEXT:    lsr w9, w9, w11
+; CHECK-NEXT:    orr w8, w10, w8
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %shy = shl i32 %y, %s
   %fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s)
@@ -411,13 +411,13 @@ define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: or_lshr_fshr:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w2
-; CHECK-NEXT:    mvn w9, w2
-; CHECK-NEXT:    lsl w10, w1, #1
-; CHECK-NEXT:    lsr w8, w0, w8
-; CHECK-NEXT:    lsl w9, w10, w9
+; CHECK-NEXT:    lsl w9, w1, #1
 ; CHECK-NEXT:    lsr w10, w1, w2
-; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    orr w0, w8, w10
+; CHECK-NEXT:    lsr w8, w0, w8
+; CHECK-NEXT:    mvn w11, w2
+; CHECK-NEXT:    lsl w9, w9, w11
+; CHECK-NEXT:    orr w8, w8, w10
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
   %shy = lshr i32 %y, %s
   %fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s)
@@ -442,13 +442,13 @@ define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: or_lshr_fshr_commute:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w2
-; CHECK-NEXT:    mvn w9, w2
-; CHECK-NEXT:    lsl w10, w1, #1
-; CHECK-NEXT:    lsr w8, w0, w8
-; CHECK-NEXT:    lsl w9, w10, w9
+; CHECK-NEXT:    lsl w9, w1, #1
 ; CHECK-NEXT:    lsr w10, w1, w2
-; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    orr w0, w10, w8
+; CHECK-NEXT:    lsr w8, w0, w8
+; CHECK-NEXT:    mvn w11, w2
+; CHECK-NEXT:    lsl w9, w9, w11
+; CHECK-NEXT:    orr w8, w10, w8
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %shy = lshr i32 %y, %s
   %fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s)
@@ -472,11 +472,11 @@ define i32 @or_lshr_rotr_commute(i32 %x, i32 %y, i32 %s) {
 define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: or_shl_fshl_simplify:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w8, w0, #1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT:    mvn w8, w2
-; CHECK-NEXT:    lsr w9, w0, #1
+; CHECK-NEXT:    mvn w9, w2
 ; CHECK-NEXT:    lsl w10, w1, w2
-; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    lsr w8, w8, w9
 ; CHECK-NEXT:    orr w0, w10, w8
 ; CHECK-NEXT:    ret
   %shy = shl i32 %y, %s
@@ -488,11 +488,11 @@ define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) {
 define i32 @or_lshr_fshr_simplify(i32 %x, i32 %y, i32 %s) {
 ; CHECK-LABEL: or_lshr_fshr_simplify:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl w8, w0, #1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT:    mvn w8, w2
-; CHECK-NEXT:    lsl w9, w0, #1
+; CHECK-NEXT:    mvn w9, w2
 ; CHECK-NEXT:    lsr w10, w1, w2
-; CHECK-NEXT:    lsl w8, w9, w8
+; CHECK-NEXT:    lsl w8, w8, w9
 ; CHECK-NEXT:    orr w0, w8, w10
 ; CHECK-NEXT:    ret
   %shy = lshr i32 %y, %s

diff --git a/llvm/test/CodeGen/AArch64/global-merge-3.ll b/llvm/test/CodeGen/AArch64/global-merge-3.ll
index 1fdae070cb90e5..b3f58887139f71 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-3.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-3.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s
 ; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-enable-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s
 ; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-enable-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
@@ -7,19 +8,28 @@
 @z = internal global i32 1, align 4
 
 define dso_local void @f1(i32 %a1, i32 %a2, i32 %a3) {
-;CHECK-APPLE-IOS: adrp    x8, _z@PAGE
-;CHECK-APPLE-IOS: adrp    x9, __MergedGlobals_x@PAGE+12
-;CHECK-APPLE-IOS-NOT: adrp
-;CHECK-APPLE-IOS: add   x9, x9, __MergedGlobals_x@PAGEOFF+12
-;CHECK-APPLE-IOS: str   w1, [x9, #400]
-;CHECK-APPLE-IOS: str   w0, [x9]
-;CHECK-APPLE-IOS: str     w2, [x8, _z@PAGEOFF]
-;CHECK: adrp    x8, z
-;CHECK: adrp    x9, .L_MergedGlobals+12
-;CHECK: add     x9, x9, :lo12:.L_MergedGlobals+12
-;CHECK: str     w1, [x9, #400]
-;CHECK: str     w0, [x9]
-;CHECK: str     w2, [x8, :lo12:z]
+; CHECK-LABEL: f1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .L_MergedGlobals+12
+; CHECK-NEXT:    add x8, x8, :lo12:.L_MergedGlobals+12
+; CHECK-NEXT:    str w0, [x8]
+; CHECK-NEXT:    str w1, [x8, #400]
+; CHECK-NEXT:    adrp x8, z
+; CHECK-NEXT:    str w2, [x8, :lo12:z]
+; CHECK-NEXT:    ret
+;
+; CHECK-APPLE-IOS-LABEL: f1:
+; CHECK-APPLE-IOS:       ; %bb.0:
+; CHECK-APPLE-IOS-NEXT:  Lloh0:
+; CHECK-APPLE-IOS-NEXT:    adrp x8, __MergedGlobals_x@PAGE+12
+; CHECK-APPLE-IOS-NEXT:  Lloh1:
+; CHECK-APPLE-IOS-NEXT:    add x8, x8, __MergedGlobals_x@PAGEOFF+12
+; CHECK-APPLE-IOS-NEXT:    str w0, [x8]
+; CHECK-APPLE-IOS-NEXT:    str w1, [x8, #400]
+; CHECK-APPLE-IOS-NEXT:    adrp x8, _z@PAGE
+; CHECK-APPLE-IOS-NEXT:    str w2, [x8, _z@PAGEOFF]
+; CHECK-APPLE-IOS-NEXT:    ret
+; CHECK-APPLE-IOS-NEXT:    .loh AdrpAdd Lloh0, Lloh1
   %x3 = getelementptr inbounds [100 x i32], ptr @x, i32 0, i64 3
   %y3 = getelementptr inbounds [100 x i32], ptr @y, i32 0, i64 3
   store i32 %a1, ptr %x3, align 4

diff --git a/llvm/test/CodeGen/AArch64/gpr_cttz.ll b/llvm/test/CodeGen/AArch64/gpr_cttz.ll
index 632514f5b805d6..3b8269385d27df 100644
--- a/llvm/test/CodeGen/AArch64/gpr_cttz.ll
+++ b/llvm/test/CodeGen/AArch64/gpr_cttz.ll
@@ -103,24 +103,24 @@ define i64 @cttz64(i64 %x) nounwind readnone {
 define i128 @cttz128(i128 %x) nounwind readnone {
 ; CHECK-LABEL: cttz128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rbit x9, x1
-; CHECK-NEXT:    rbit x8, x0
-; CHECK-NEXT:    clz x9, x9
-; CHECK-NEXT:    clz x8, x8
-; CHECK-NEXT:    add x9, x9, #64
+; CHECK-NEXT:    rbit x8, x1
+; CHECK-NEXT:    rbit x9, x0
 ; CHECK-NEXT:    cmp x0, #0
-; CHECK-NEXT:    csel x0, x8, x9, ne
 ; CHECK-NEXT:    mov x1, xzr
+; CHECK-NEXT:    clz x8, x8
+; CHECK-NEXT:    clz x9, x9
+; CHECK-NEXT:    add x8, x8, #64
+; CHECK-NEXT:    csel x0, x9, x8, ne
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-CSSC-LABEL: cttz128:
 ; CHECK-CSSC:       // %bb.0:
-; CHECK-CSSC-NEXT:    ctz x9, x1
-; CHECK-CSSC-NEXT:    ctz x8, x0
-; CHECK-CSSC-NEXT:    add x9, x9, #64
+; CHECK-CSSC-NEXT:    ctz x8, x1
+; CHECK-CSSC-NEXT:    ctz x9, x0
 ; CHECK-CSSC-NEXT:    cmp x0, #0
-; CHECK-CSSC-NEXT:    csel x0, x8, x9, ne
+; CHECK-CSSC-NEXT:    add x8, x8, #64
 ; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    csel x0, x9, x8, ne
 ; CHECK-CSSC-NEXT:    ret
   %ctz = tail call i128 @llvm.cttz.i128(i128 %x)
   ret i128 %ctz

diff --git a/llvm/test/CodeGen/AArch64/half.ll b/llvm/test/CodeGen/AArch64/half.ll
index 577cd8b02b8a54..bb802033e05fc6 100644
--- a/llvm/test/CodeGen/AArch64/half.ll
+++ b/llvm/test/CodeGen/AArch64/half.ll
@@ -99,16 +99,16 @@ define void @test_trunc64(double %in, ptr %addr) {
 define i16 @test_fccmp(i1 %a, i16 %in) {
 ; CHECK-LABEL: test_fccmp:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #24576
 ; CHECK-NEXT:    fmov s0, w1
+; CHECK-NEXT:    mov w8, #24576 // =0x6000
 ; CHECK-NEXT:    movk w8, #15974, lsl #16
-; CHECK-NEXT:    fcvt s0, h0
 ; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    mov w8, #16384
+; CHECK-NEXT:    mov w8, #16384 // =0x4000
+; CHECK-NEXT:    fcvt s0, h0
 ; CHECK-NEXT:    movk w8, #15428, lsl #16
-; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fccmp s0, s2, #8, pl
 ; CHECK-NEXT:    csinc w8, w8, wzr, mi
 ; CHECK-NEXT:    fcmp s0, s1

diff --git a/llvm/test/CodeGen/AArch64/highextractbitcast.ll b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
index e7b2a49d699eab..f82d1ed87fba75 100644
--- a/llvm/test/CodeGen/AArch64/highextractbitcast.ll
+++ b/llvm/test/CodeGen/AArch64/highextractbitcast.ll
@@ -90,10 +90,10 @@ define <4 x i32> @test_smull_high_s16_bitcasta2(<2 x i64> %a, <8 x i16> %b) #0 {
 ;
 ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta2:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT:    rev64 v0.4h, v0.4h
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h
@@ -147,10 +147,10 @@ define <4 x i32> @test_smull_high_s16_bitcasta1_wrongindex(<2 x i64> %aa, <8 x i
 ;
 ; CHECK-BE-LABEL: test_smull_high_s16_bitcasta1_wrongindex:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
 ; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
 ; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    smull v0.4s, v0.4h, v1.4h

diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 28f0a3071edf13..97511639ec8cf8 100644
--- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -202,8 +202,8 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-LABEL: vec_4xi32_nonsplat_eq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI13_0
 ; CHECK-NEXT:    neg v1.4s, v1.4s
+; CHECK-NEXT:    adrp x8, .LCPI13_0
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_0]
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
@@ -285,7 +285,7 @@ define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
 define i1 @scalar_i32_x_is_const_eq(i32 %y) nounwind {
 ; CHECK-LABEL: scalar_i32_x_is_const_eq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #43605
+; CHECK-NEXT:    mov w8, #43605 // =0xaa55
 ; CHECK-NEXT:    movk w8, #43605, lsl #16
 ; CHECK-NEXT:    lsr w8, w8, w0
 ; CHECK-NEXT:    tst w8, #0x1
@@ -299,7 +299,7 @@ define i1 @scalar_i32_x_is_const_eq(i32 %y) nounwind {
 define i1 @scalar_i32_x_is_const2_eq(i32 %y) nounwind {
 ; CHECK-LABEL: scalar_i32_x_is_const2_eq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    lsr w8, w8, w0
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, eq
@@ -324,7 +324,7 @@ define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
 define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: scalar_i8_signbit_eq_with_nonzero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #128
+; CHECK-NEXT:    mov w8, #128 // =0x80
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    lsr w8, w8, w1
 ; CHECK-NEXT:    and w8, w8, w0

diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll
index a16c51541df152..2f7848de82274e 100644
--- a/llvm/test/CodeGen/AArch64/i128-math.ll
+++ b/llvm/test/CodeGen/AArch64/i128-math.ll
@@ -262,19 +262,19 @@ define i128 @u128_mul(i128 %x, i128 %y) {
 define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_checked_mul:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
-; CHECK-NEXT:    umulh x8, x1, x2
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    mul x9, x3, x0
+; CHECK-NEXT:    umulh x8, x1, x2
+; CHECK-NEXT:    umulh x10, x3, x0
 ; CHECK-NEXT:    madd x9, x1, x2, x9
 ; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x8, x3, x0
-; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    umulh x8, x0, x2
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x10, #0, eq
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    cset w10, ne
-; CHECK-NEXT:    adds x1, x8, x9
-; CHECK-NEXT:    csinc w8, w10, wzr, lo
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    adds x1, x11, x9
+; CHECK-NEXT:    csinc w8, w8, wzr, lo
 ; CHECK-NEXT:    eor w2, w8, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
@@ -290,19 +290,19 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
 define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_overflowing_mul:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
-; CHECK-NEXT:    umulh x8, x1, x2
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x9, x3, x0
+; CHECK-NEXT:    umulh x8, x1, x2
+; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    madd x9, x1, x2, x9
 ; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    mul x8, x3, x0
-; CHECK-NEXT:    madd x8, x1, x2, x8
-; CHECK-NEXT:    ccmp xzr, x9, #0, eq
-; CHECK-NEXT:    umulh x9, x0, x2
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x10, #0, eq
 ; CHECK-NEXT:    mul x0, x0, x2
-; CHECK-NEXT:    cset w10, ne
-; CHECK-NEXT:    adds x1, x9, x8
-; CHECK-NEXT:    csinc w2, w10, wzr, lo
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    adds x1, x11, x9
+; CHECK-NEXT:    csinc w2, w8, wzr, lo
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -316,22 +316,22 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
 define i128 @u128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-LABEL: u128_saturating_mul:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x9, x3, x0
 ; CHECK-NEXT:    cmp x1, #0
-; CHECK-NEXT:    umulh x8, x1, x2
 ; CHECK-NEXT:    ccmp x3, #0, #4, ne
-; CHECK-NEXT:    umulh x9, x3, x0
+; CHECK-NEXT:    umulh x8, x1, x2
+; CHECK-NEXT:    umulh x10, x3, x0
+; CHECK-NEXT:    madd x9, x1, x2, x9
 ; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    mul x8, x3, x0
-; CHECK-NEXT:    madd x8, x1, x2, x8
-; CHECK-NEXT:    ccmp xzr, x9, #0, eq
-; CHECK-NEXT:    umulh x9, x0, x2
+; CHECK-NEXT:    umulh x11, x0, x2
+; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    mul x8, x0, x2
 ; CHECK-NEXT:    cset w10, ne
-; CHECK-NEXT:    adds x8, x9, x8
-; CHECK-NEXT:    csinc w9, w10, wzr, lo
-; CHECK-NEXT:    mul x10, x0, x2
-; CHECK-NEXT:    cmp w9, #0
-; CHECK-NEXT:    csinv x0, x10, xzr, eq
-; CHECK-NEXT:    csinv x1, x8, xzr, eq
+; CHECK-NEXT:    adds x9, x11, x9
+; CHECK-NEXT:    csinc w10, w10, wzr, lo
+; CHECK-NEXT:    cmp w10, #0
+; CHECK-NEXT:    csinv x0, x8, xzr, eq
+; CHECK-NEXT:    csinv x1, x9, xzr, eq
 ; CHECK-NEXT:    ret
   %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
   %2 = extractvalue { i128, i1 } %1, 0
@@ -411,13 +411,13 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) {
 ; CHECK-NEXT:    mov x20, x1
 ; CHECK-NEXT:    str xzr, [sp, #8]
 ; CHECK-NEXT:    bl __muloti4
-; CHECK-NEXT:    ldr x8, [sp, #8]
-; CHECK-NEXT:    eor x9, x19, x20
+; CHECK-NEXT:    eor x8, x19, x20
+; CHECK-NEXT:    ldr x9, [sp, #8]
+; CHECK-NEXT:    asr x8, x8, #63
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    asr x9, x9, #63
-; CHECK-NEXT:    eor x10, x9, #0x7fffffffffffffff
-; CHECK-NEXT:    cmp x8, #0
-; CHECK-NEXT:    csinv x0, x0, x9, eq
+; CHECK-NEXT:    cmp x9, #0
+; CHECK-NEXT:    eor x10, x8, #0x7fffffffffffffff
+; CHECK-NEXT:    csinv x0, x0, x8, eq
 ; CHECK-NEXT:    csel x1, x10, x1, ne
 ; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
index 2b71a9354c5ced..73708e3fd8c44b 100644
--- a/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
+++ b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
@@ -39,11 +39,11 @@ define void @test3() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, x
 ; CHECK-NEXT:    add x8, x8, :lo12:x
-; CHECK-NEXT:    add x8, x8, #512
 ; CHECK-NEXT:    adrp x10, y
 ; CHECK-NEXT:    add x10, x10, :lo12:y
-; CHECK-NEXT:    add x10, x10, #512
+; CHECK-NEXT:    add x8, x8, #512
 ; CHECK-NEXT:    ldp x8, x9, [x8]
+; CHECK-NEXT:    add x10, x10, #512
 ; CHECK-NEXT:    stp x8, x9, [x10]
 ; CHECK-NEXT:    ret
   %tmp = load volatile i128, ptr getelementptr (i8, ptr @x, i64 512)
@@ -71,11 +71,11 @@ define void @test5() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, x
 ; CHECK-NEXT:    add x8, x8, :lo12:x
-; CHECK-NEXT:    sub x8, x8, #520
 ; CHECK-NEXT:    adrp x10, y
 ; CHECK-NEXT:    add x10, x10, :lo12:y
-; CHECK-NEXT:    sub x10, x10, #520
+; CHECK-NEXT:    sub x8, x8, #520
 ; CHECK-NEXT:    ldp x8, x9, [x8]
+; CHECK-NEXT:    sub x10, x10, #520
 ; CHECK-NEXT:    stp x8, x9, [x10]
 ; CHECK-NEXT:    ret
   %tmp = load volatile i128, ptr getelementptr (i8, ptr @x, i64 -520)
@@ -88,11 +88,11 @@ define void @test6() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, x
 ; CHECK-NEXT:    add x8, x8, :lo12:x
-; CHECK-NEXT:    sub x8, x8, #520
 ; CHECK-NEXT:    adrp x10, y
 ; CHECK-NEXT:    add x10, x10, :lo12:y
-; CHECK-NEXT:    sub x10, x10, #520
+; CHECK-NEXT:    sub x8, x8, #520
 ; CHECK-NEXT:    ldp x8, x9, [x8]
+; CHECK-NEXT:    sub x10, x10, #520
 ; CHECK-NEXT:    stp x8, x9, [x10]
 ; CHECK-NEXT:    ret
   %tmp = load volatile i128, ptr getelementptr (i8, ptr @x, i64 -520)
@@ -105,11 +105,11 @@ define void @test7() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, x
 ; CHECK-NEXT:    add x8, x8, :lo12:x
-; CHECK-NEXT:    add x8, x8, #503
 ; CHECK-NEXT:    adrp x10, y
 ; CHECK-NEXT:    add x10, x10, :lo12:y
-; CHECK-NEXT:    add x10, x10, #503
+; CHECK-NEXT:    add x8, x8, #503
 ; CHECK-NEXT:    ldp x8, x9, [x8]
+; CHECK-NEXT:    add x10, x10, #503
 ; CHECK-NEXT:    stp x8, x9, [x10]
 ; CHECK-NEXT:    ret
   %tmp = load volatile i128, ptr getelementptr (i8, ptr @x, i64 503)

diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
index f098b1e7b62015..767ca91a58bb10 100644
--- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
@@ -8,8 +8,8 @@ define i1 @unordered_floating_point_compare_on_v8f32(<8 x float> %a_vec) {
 ; CHECK-LABEL: unordered_floating_point_compare_on_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
-; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
@@ -27,8 +27,8 @@ define i1 @unordered_floating_point_compare_on_v16f32(<16 x float> %a_vec) {
 ; CHECK-LABEL: unordered_floating_point_compare_on_v16f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmgt v3.4s, v3.4s, #0.0
-; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    fcmgt v2.4s, v2.4s, #0.0
+; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
 ; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
@@ -36,8 +36,8 @@ define i1 @unordered_floating_point_compare_on_v16f32(<16 x float> %a_vec) {
 ; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    umaxv b0, v0.16b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    bic w0, w8, w9
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    bic w0, w9, w8
 ; CHECK-NEXT:    ret
   %a_cmp = fcmp ule <16 x float> %a_vec, zeroinitializer
   %cmp_result = bitcast <16 x i1> %a_cmp to i16
@@ -49,8 +49,8 @@ define i1 @unordered_floating_point_compare_on_v32f32(<32 x float> %a_vec) {
 ; CHECK-LABEL: unordered_floating_point_compare_on_v32f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmgt v3.4s, v3.4s, #0.0
-; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:    fcmgt v2.4s, v2.4s, #0.0
+; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
 ; CHECK-NEXT:    fcmgt v7.4s, v7.4s, #0.0

diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
index f2c61f6562bfbf..e4d2b516b8fbfe 100644
--- a/llvm/test/CodeGen/AArch64/insert-extend.ll
+++ b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -50,112 +50,118 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    sxtw x8, w1
 ; CHECK-NEXT:    sxtw x9, w3
-; CHECK-NEXT:    add x10, x0, x8
-; CHECK-NEXT:    add x11, x2, x9
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    add x12, x10, x8
 ; CHECK-NEXT:    ldr d1, [x2]
+; CHECK-NEXT:    add x10, x0, x8
+; CHECK-NEXT:    add x11, x2, x9
 ; CHECK-NEXT:    ldr d2, [x10]
-; CHECK-NEXT:    add x10, x11, x9
 ; CHECK-NEXT:    ldr d3, [x11]
-; CHECK-NEXT:    ldr d4, [x12]
-; CHECK-NEXT:    ldr d5, [x10]
-; CHECK-NEXT:    ldr d6, [x12, x8]
-; CHECK-NEXT:    ldr d7, [x10, x9]
 ; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    add x10, x10, x8
+; CHECK-NEXT:    add x11, x11, x9
+; CHECK-NEXT:    usubl v1.8h, v2.8b, v3.8b
+; CHECK-NEXT:    ldr d2, [x10]
+; CHECK-NEXT:    ldr d3, [x11]
+; CHECK-NEXT:    ldr d4, [x10, x8]
+; CHECK-NEXT:    ldr d5, [x11, x9]
 ; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
-; CHECK-NEXT:    usubl v1.8h, v4.8b, v5.8b
+; CHECK-NEXT:    usubl v3.8h, v4.8b, v5.8b
 ; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
-; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
-; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
+; CHECK-NEXT:    shll2 v5.4s, v1.8h, #16
 ; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
-; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
-; CHECK-NEXT:    shll2 v7.4s, v1.8h, #16
-; CHECK-NEXT:    shll2 v6.4s, v3.8h, #16
-; CHECK-NEXT:    saddw v1.4s, v7.4s, v1.4h
+; CHECK-NEXT:    shll2 v6.4s, v2.8h, #16
+; CHECK-NEXT:    saddw v1.4s, v5.4s, v1.4h
+; CHECK-NEXT:    shll2 v4.4s, v3.8h, #16
+; CHECK-NEXT:    saddw v2.4s, v6.4s, v2.4h
+; CHECK-NEXT:    saddw v3.4s, v4.4s, v3.4h
 ; CHECK-NEXT:    rev64 v4.4s, v0.4s
-; CHECK-NEXT:    rev64 v5.4s, v2.4s
-; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
-; CHECK-NEXT:    rev64 v7.4s, v1.4s
+; CHECK-NEXT:    rev64 v5.4s, v1.4s
+; CHECK-NEXT:    rev64 v6.4s, v2.4s
+; CHECK-NEXT:    rev64 v7.4s, v3.4s
 ; CHECK-NEXT:    sub v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    rev64 v6.4s, v3.4s
-; CHECK-NEXT:    sub v5.4s, v2.4s, v5.4s
-; CHECK-NEXT:    sub v7.4s, v1.4s, v7.4s
+; CHECK-NEXT:    addp v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    sub v5.4s, v1.4s, v5.4s
+; CHECK-NEXT:    sub v6.4s, v2.4s, v6.4s
+; CHECK-NEXT:    addp v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    sub v7.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    zip1 v16.4s, v5.4s, v4.4s
-; CHECK-NEXT:    addp v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    sub v6.4s, v3.4s, v6.4s
-; CHECK-NEXT:    addp v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    zip2 v17.4s, v7.4s, v6.4s
-; CHECK-NEXT:    mov v7.s[1], v6.s[0]
-; CHECK-NEXT:    ext v2.16b, v5.16b, v16.16b, #8
+; CHECK-NEXT:    ext v1.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    zip2 v3.4s, v6.4s, v7.4s
+; CHECK-NEXT:    mov v6.s[1], v7.s[0]
+; CHECK-NEXT:    ext v7.16b, v5.16b, v16.16b, #8
 ; CHECK-NEXT:    mov v5.s[3], v4.s[2]
-; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT:    uzp2 v4.4s, v1.4s, v0.4s
-; CHECK-NEXT:    mov v7.d[1], v2.d[1]
-; CHECK-NEXT:    mov v17.d[1], v5.d[1]
-; CHECK-NEXT:    uzp1 v1.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp1 v2.4s, v3.4s, v0.4s
-; CHECK-NEXT:    uzp2 v0.4s, v3.4s, v0.4s
-; CHECK-NEXT:    add v3.4s, v17.4s, v7.4s
-; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
-; CHECK-NEXT:    sub v4.4s, v7.4s, v17.4s
-; CHECK-NEXT:    sub v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    rev64 v2.4s, v3.4s
-; CHECK-NEXT:    rev64 v5.4s, v4.4s
-; CHECK-NEXT:    rev64 v7.4s, v0.4s
+; CHECK-NEXT:    uzp1 v4.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    mov v6.d[1], v7.d[1]
+; CHECK-NEXT:    mov v3.d[1], v5.d[1]
+; CHECK-NEXT:    uzp2 v5.4s, v2.4s, v0.4s
+; CHECK-NEXT:    uzp1 v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    sub v1.4s, v4.4s, v1.4s
+; CHECK-NEXT:    add v2.4s, v3.4s, v6.4s
+; CHECK-NEXT:    sub v3.4s, v6.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-NEXT:    rev64 v6.4s, v1.4s
-; CHECK-NEXT:    addp v16.4s, v0.4s, v4.4s
-; CHECK-NEXT:    addp v17.4s, v1.4s, v3.4s
-; CHECK-NEXT:    sub v4.4s, v4.4s, v5.4s
-; CHECK-NEXT:    sub v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    rev64 v4.4s, v2.4s
+; CHECK-NEXT:    rev64 v5.4s, v3.4s
+; CHECK-NEXT:    addp v16.4s, v1.4s, v3.4s
+; CHECK-NEXT:    rev64 v7.4s, v0.4s
+; CHECK-NEXT:    addp v17.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    sub v1.4s, v1.4s, v6.4s
-; CHECK-NEXT:    ext v3.16b, v16.16b, v4.16b, #4
-; CHECK-NEXT:    ext v5.16b, v0.16b, v16.16b, #8
-; CHECK-NEXT:    ext v6.16b, v17.16b, v2.16b, #4
-; CHECK-NEXT:    zip1 v7.4s, v17.4s, v17.4s
-; CHECK-NEXT:    zip2 v3.4s, v3.4s, v16.4s
-; CHECK-NEXT:    ext v18.16b, v5.16b, v0.16b, #4
-; CHECK-NEXT:    zip2 v6.4s, v6.4s, v17.4s
-; CHECK-NEXT:    trn2 v7.4s, v7.4s, v1.4s
-; CHECK-NEXT:    ext v1.16b, v1.16b, v17.16b, #4
-; CHECK-NEXT:    ext v3.16b, v4.16b, v3.16b, #12
-; CHECK-NEXT:    mov v0.s[2], v16.s[1]
-; CHECK-NEXT:    ext v6.16b, v2.16b, v6.16b, #12
-; CHECK-NEXT:    mov v4.s[2], v16.s[3]
-; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v18.4s
-; CHECK-NEXT:    mov v2.s[2], v17.s[3]
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    sub v18.4s, v4.4s, v3.4s
-; CHECK-NEXT:    sub v19.4s, v0.4s, v5.4s
-; CHECK-NEXT:    sub v20.4s, v2.4s, v6.4s
-; CHECK-NEXT:    mov v4.s[1], v16.s[2]
-; CHECK-NEXT:    sub v21.4s, v7.4s, v1.4s
-; CHECK-NEXT:    mov v2.s[1], v17.s[2]
-; CHECK-NEXT:    mov v0.s[1], v16.s[0]
-; CHECK-NEXT:    mov v1.s[0], v17.s[1]
-; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT:    add v2.4s, v2.4s, v6.4s
+; CHECK-NEXT:    sub v3.4s, v3.4s, v5.4s
+; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    zip1 v18.4s, v17.4s, v17.4s
+; CHECK-NEXT:    ext v6.16b, v1.16b, v16.16b, #8
+; CHECK-NEXT:    ext v4.16b, v17.16b, v2.16b, #4
+; CHECK-NEXT:    ext v5.16b, v16.16b, v3.16b, #4
+; CHECK-NEXT:    mov v20.16b, v3.16b
+; CHECK-NEXT:    ext v7.16b, v0.16b, v17.16b, #4
+; CHECK-NEXT:    mov v21.16b, v2.16b
+; CHECK-NEXT:    trn2 v0.4s, v18.4s, v0.4s
+; CHECK-NEXT:    ext v19.16b, v6.16b, v1.16b, #4
+; CHECK-NEXT:    mov v1.s[2], v16.s[1]
+; CHECK-NEXT:    mov v20.s[2], v16.s[3]
+; CHECK-NEXT:    zip2 v4.4s, v4.4s, v17.4s
+; CHECK-NEXT:    zip2 v5.4s, v5.4s, v16.4s
+; CHECK-NEXT:    mov v21.s[2], v17.s[3]
+; CHECK-NEXT:    ext v7.16b, v7.16b, v7.16b, #4
+; CHECK-NEXT:    mov v18.16b, v1.16b
+; CHECK-NEXT:    ext v2.16b, v2.16b, v4.16b, #12
+; CHECK-NEXT:    ext v3.16b, v3.16b, v5.16b, #12
+; CHECK-NEXT:    uzp2 v4.4s, v6.4s, v19.4s
+; CHECK-NEXT:    mov v5.16b, v7.16b
+; CHECK-NEXT:    mov v6.16b, v20.16b
+; CHECK-NEXT:    mov v19.16b, v21.16b
+; CHECK-NEXT:    mov v18.s[1], v16.s[0]
+; CHECK-NEXT:    sub v7.4s, v0.4s, v7.4s
+; CHECK-NEXT:    mov v6.s[1], v16.s[2]
+; CHECK-NEXT:    mov v5.s[0], v17.s[1]
+; CHECK-NEXT:    mov v19.s[1], v17.s[2]
+; CHECK-NEXT:    sub v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    sub v16.4s, v20.4s, v3.4s
+; CHECK-NEXT:    sub v17.4s, v21.4s, v2.4s
+; CHECK-NEXT:    add v4.4s, v18.4s, v4.4s
+; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
-; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
-; CHECK-NEXT:    mov v3.d[1], v18.d[1]
-; CHECK-NEXT:    mov v2.d[1], v20.d[1]
-; CHECK-NEXT:    mov v1.d[1], v21.d[1]
-; CHECK-NEXT:    mov v0.d[1], v19.d[1]
-; CHECK-NEXT:    cmlt v4.8h, v3.8h, #0
-; CHECK-NEXT:    cmlt v5.8h, v2.8h, #0
-; CHECK-NEXT:    cmlt v6.8h, v1.8h, #0
-; CHECK-NEXT:    cmlt v7.8h, v0.8h, #0
-; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v7.4s, v0.4s
-; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT:    add v2.4s, v5.4s, v2.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v4.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v7.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v6.16b
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-NEXT:    add v2.4s, v19.4s, v2.4s
+; CHECK-NEXT:    mov v4.d[1], v1.d[1]
+; CHECK-NEXT:    mov v3.d[1], v16.d[1]
+; CHECK-NEXT:    mov v0.d[1], v7.d[1]
+; CHECK-NEXT:    mov v2.d[1], v17.d[1]
+; CHECK-NEXT:    cmlt v6.8h, v4.8h, #0
+; CHECK-NEXT:    cmlt v1.8h, v3.8h, #0
+; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v2.8h, #0
+; CHECK-NEXT:    add v4.4s, v6.4s, v4.4s
+; CHECK-NEXT:    add v3.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT:    add v2.4s, v7.4s, v2.4s
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
+; CHECK-NEXT:    eor v2.16b, v2.16b, v7.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v6.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v5.16b
+; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0

diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
index d731b27e18ce33..f5d14779f6586e 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -9,9 +9,9 @@ define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_zero_i8(<vscale x 8 x
 ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
 ; CHECK-NEXT:    ret
@@ -25,9 +25,9 @@ define <vscale x 8 x i8> @vec_scalable_subvec_scalable_idx_nonzero_i8(<vscale x
 ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x1]
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    ret
@@ -41,9 +41,9 @@ define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_zero_i16(<vscale x 4
 ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT:    ret
@@ -57,9 +57,9 @@ define <vscale x 4 x i16> @vec_scalable_subvec_scalable_idx_nonzero_i16(<vscale
 ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -75,11 +75,11 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_zero_i8(<vscale x 8 x i8
 ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.h, vl8
 ; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 8 x i8>, <vscale x 8 x i8>* %a
   %subvec = load <8 x i8>, <8 x i8>* %b
@@ -92,16 +92,16 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(<vscale x 8 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    cnth x8
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    sub x8, x8, #8
-; CHECK-NEXT:    mov w9, #8
 ; CHECK-NEXT:    cmp x8, #8
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    lsl x8, x8, #1
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    lsl x8, x8, #1
+; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    str q1, [x9, x8]
@@ -119,11 +119,11 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_zero_i16(<vscale x 4 x
 ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.s, vl4
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
   %subvec = load <4 x i16>, <4 x i16>* %b
@@ -136,16 +136,16 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(<vscale x 4
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    cntw x8
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    mov w9, #4 // =0x4
 ; CHECK-NEXT:    sub x8, x8, #4
-; CHECK-NEXT:    mov w9, #4
 ; CHECK-NEXT:    cmp x8, #4
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    lsl x8, x8, #2
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    str q1, [x9, x8]
@@ -163,11 +163,11 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_zero_i32(<vscale x 2 x
 ; CHECK-LABEL: vec_scalable_subvec_fixed_idx_zero_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.d, vl2
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEXT:    mov z0.d, p0/m, z1.d
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 2 x i32>, <vscale x 2 x i32>* %a
   %subvec = load <2 x i32>, <2 x i32>* %b
@@ -180,16 +180,16 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(<vscale x 2
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    sub x8, x8, #2
-; CHECK-NEXT:    mov w9, #2
 ; CHECK-NEXT:    cmp x8, #2
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    lsl x8, x8, #3
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    str q1, [x9, x8]

diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll
index d86b3b526eed78..d7656e1cd341f6 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll
@@ -47,8 +47,8 @@ define <16 x i8> @insert_v16i8_4_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
 define <16 x i8> @insert_v16i8_4_15(float %tmp, <16 x i8> %b, <16 x i8> %a) {
 ; CHECK-LABEL: insert_v16i8_4_15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    mov v3.16b, v1.16b
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
@@ -146,8 +146,8 @@ define <8 x i16> @insert_v8i16_2_1(float %tmp, <8 x i16> %b, <8 x i16> %a) {
 define <8 x i16> @insert_v8i16_2_15(float %tmp, <8 x i16> %b, <8 x i16> %a) {
 ; CHECK-LABEL: insert_v8i16_2_15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI13_0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI13_0
 ; CHECK-NEXT:    mov v3.16b, v1.16b
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI13_0]
 ; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
@@ -272,8 +272,8 @@ define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, ptr %a) {
 define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v16i8_4_15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI24_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q0_q1
+; CHECK-NEXT:    adrp x8, .LCPI24_0
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI24_0]
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
@@ -338,8 +338,8 @@ define <8 x i8> @load_v8i8_4_2(float %tmp, <8 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i8_4_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d0, d1
-; CHECK-NEXT:    ldr s1, [x0]
-; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    ldr s2, [x0]
+; CHECK-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %l = load <4 x i8>, ptr %a
@@ -365,8 +365,8 @@ define <16 x i8> @load_v16i8_8_2(float %tmp, <16 x i8> %b, ptr %a) {
 ; CHECK-LABEL: load_v16i8_8_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-NEXT:    ret
   %l = load <8 x i8>, ptr %a
   %s1 = shufflevector <8 x i8> %l, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -379,13 +379,13 @@ define <16 x i8> @load_v16i8_8_2(float %tmp, <16 x i8> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    add x8, x0, #2
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    add x9, x0, #2
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-NEXT:    xtn v2.4h, v0.4s
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
-; CHECK-NEXT:    xtn v1.4h, v2.4s
-; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    mov v0.s[0], v2.s[0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, ptr %a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -396,15 +396,15 @@ define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    add x8, x0, #2
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    add x9, x0, #2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q0_q1
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
+; CHECK-NEXT:    fmov s2, w8
 ; CHECK-NEXT:    adrp x8, .LCPI33_0
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI33_0]
+; CHECK-NEXT:    ld1 { v2.h }[2], [x9]
 ; CHECK-NEXT:    xtn v0.4h, v2.4s
-; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v3.16b
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI33_0]
+; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, ptr %a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -415,13 +415,13 @@ define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    add x8, x0, #2
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    add x9, x0, #2
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-NEXT:    xtn v2.4h, v0.4s
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
-; CHECK-NEXT:    xtn v1.4h, v2.4s
-; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, ptr %a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -432,13 +432,13 @@ define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    add x8, x0, #2
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    add x9, x0, #2
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-NEXT:    xtn v2.4h, v0.4s
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
-; CHECK-NEXT:    xtn v1.4h, v2.4s
-; CHECK-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, ptr %a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -449,13 +449,13 @@ define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
 define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_2_4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    add x8, x0, #2
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    add x9, x0, #2
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-NEXT:    xtn v2.4h, v0.4s
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    ld1 { v2.h }[2], [x8]
-; CHECK-NEXT:    xtn v1.4h, v2.4s
-; CHECK-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-NEXT:    mov v0.s[3], v2.s[0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i16>, ptr %a
   %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -514,8 +514,8 @@ define <8 x i16> @load_v8i16_4_2(float %tmp, <8 x i16> %b, ptr %a) {
 ; CHECK-LABEL: load_v8i16_4_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-NEXT:    ret
   %l = load <4 x i16>, ptr %a
   %s1 = shufflevector <4 x i16> %l, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -542,8 +542,8 @@ define <4 x i32> @load_v4i32_2_2(float %tmp, <4 x i32> %b, ptr %a) {
 ; CHECK-LABEL: load_v4i32_2_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ldr d2, [x0]
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-NEXT:    ret
   %l = load <2 x i32>, ptr %a
   %s1 = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -621,13 +621,13 @@ define <16 x i8> @load2multi1_v4i8(float %tmp, ptr %a, ptr %b) {
 define <16 x i8> @load2multi2_v4i8(float %tmp, ptr %a, ptr %b) {
 ; CHECK-LABEL: load2multi2_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x1]
-; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    mov v0.d[1], v0.d[0]
 ; CHECK-NEXT:    mov v1.d[1], v1.d[0]
-; CHECK-NEXT:    uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    mov v0.d[1], v0.d[0]
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %la = load <4 x i8>, ptr %a
   %lb = load <4 x i8>, ptr %b
@@ -640,8 +640,8 @@ define <16 x i8> @load2multi2_v4i8(float %tmp, ptr %a, ptr %b) {
 define void @loads_before_stores(ptr %i44) {
 ; CHECK-LABEL: loads_before_stores:
 ; CHECK:       // %bb.0: // %bb
-; CHECK-NEXT:    add x8, x0, #20
 ; CHECK-NEXT:    ldr s0, [x0, #28]
+; CHECK-NEXT:    add x8, x0, #20
 ; CHECK-NEXT:    ldrh w9, [x0, #26]
 ; CHECK-NEXT:    ldrh w10, [x0, #24]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x8]

diff --git a/llvm/test/CodeGen/AArch64/insertshuffleload.ll b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
index 17ddd0d0b99723..b97b9b1dcdcf8d 100644
--- a/llvm/test/CodeGen/AArch64/insertshuffleload.ll
+++ b/llvm/test/CodeGen/AArch64/insertshuffleload.ll
@@ -121,11 +121,11 @@ define <8 x i32> @inserti32_last(ptr %p) {
 define <8 x i32> @inserti32_first_multiuse(ptr %p) {
 ; CHECK-LABEL: inserti32_first_multiuse:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    ldur q1, [x0, #20]
 ; CHECK-NEXT:    ldur q0, [x0, #4]
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 4
   %l1 = load <8 x i32>, ptr %q
@@ -140,10 +140,10 @@ define <8 x i32> @inserti32_last_multiuse(ptr %p) {
 ; CHECK-LABEL: inserti32_last_multiuse:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldur q2, [x0, #4]
-; CHECK-NEXT:    ldur q3, [x0, #20]
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    ldur q2, [x0, #20]
+; CHECK-NEXT:    ldur q3, [x0, #4]
+; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    ret
   %q = getelementptr inbounds i8, ptr %p, i32 32
   %l1 = load <8 x i32>, ptr %p
@@ -291,8 +291,8 @@ define <8 x i8> @wrong_shuffle(ptr %p) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldur d0, [x0, #1]
 ; CHECK-NEXT:    adrp x8, .LCPI19_0
-; CHECK-NEXT:    mov v0.d[1], v0.d[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT:    mov v0.d[1], v0.d[0]
 ; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
 ; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0

diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll
index a4aa7b73e77bef..458bd7eeba16cf 100644
--- a/llvm/test/CodeGen/AArch64/isinf.ll
+++ b/llvm/test/CodeGen/AArch64/isinf.ll
@@ -10,7 +10,7 @@ declare fp128  @llvm.fabs.f128(fp128)
 define i32 @replace_isinf_call_f16(half %x) {
 ; CHECK-LABEL: replace_isinf_call_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31744
+; CHECK-NEXT:    mov w8, #31744 // =0x7c00
 ; CHECK-NEXT:    fabs h0, h0
 ; CHECK-NEXT:    fmov h1, w8
 ; CHECK-NEXT:    fcmp h0, h1
@@ -26,8 +26,8 @@ define i32 @replace_isinf_call_f16(half %x) {
 define i32 @replace_isinf_call_f32(float %x) {
 ; CHECK-LABEL: replace_isinf_call_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #2139095040
 ; CHECK-NEXT:    fabs s0, s0
+; CHECK-NEXT:    mov w8, #2139095040 // =0x7f800000
 ; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    cset w0, eq
@@ -42,8 +42,8 @@ define i32 @replace_isinf_call_f32(float %x) {
 define i32 @replace_isinf_call_f64(double %x) {
 ; CHECK-LABEL: replace_isinf_call_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #9218868437227405312
 ; CHECK-NEXT:    fabs d0, d0
+; CHECK-NEXT:    mov x8, #9218868437227405312 // =0x7ff0000000000000
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    fcmp d0, d1
 ; CHECK-NEXT:    cset w0, eq
@@ -70,8 +70,8 @@ define i32 @replace_isinf_call_f128(fp128 %x) {
 ; CHECK-NEXT:    ldr q0, [sp]
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    bl __eqtf2
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    cmp w0, #0
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/known-never-nan.ll b/llvm/test/CodeGen/AArch64/known-never-nan.ll
index 1d602eae9724ce..d024f713a86ca9 100644
--- a/llvm/test/CodeGen/AArch64/known-never-nan.ll
+++ b/llvm/test/CodeGen/AArch64/known-never-nan.ll
@@ -28,13 +28,13 @@ define float @fmaxnm(i32 %i1, i32 %i2) #0 {
 define float @not_fmaxnm_maybe_nan(i32 %i1, i32 %i2) #0 {
 ; CHECK-LABEL: not_fmaxnm_maybe_nan:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-8388608
 ; CHECK-NEXT:    ucvtf s0, w0
 ; CHECK-NEXT:    ucvtf s1, w1
-; CHECK-NEXT:    fmov s3, #17.00000000
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    fadd s1, s1, s3
-; CHECK-NEXT:    fmul s0, s0, s2
+; CHECK-NEXT:    mov w8, #-8388608 // =0xff800000
+; CHECK-NEXT:    fmov s2, #17.00000000
+; CHECK-NEXT:    fmov s3, w8
+; CHECK-NEXT:    fmul s0, s0, s3
+; CHECK-NEXT:    fadd s1, s1, s2
 ; CHECK-NEXT:    fcmp s0, s1
 ; CHECK-NEXT:    fcsel s0, s0, s1, pl
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
index dcf2732aa43646..f8cba4ce4b63bf 100644
--- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
+++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
@@ -12,17 +12,17 @@ define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
 ; CHECK-NEXT:    sunpkhi z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpkhi z4.s, z2.h
 ; CHECK-NEXT:    sunpkhi z5.s, z3.h
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    sunpkhi z5.s, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    sunpkhi z3.s, z1.h
-; CHECK-NEXT:    sunpkhi z5.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
@@ -113,23 +113,23 @@ define <vscale x 16 x i8> @srem_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
 ; CHECK-NEXT:    sunpkhi z2.h, z1.b
 ; CHECK-NEXT:    sunpkhi z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sunpkhi z5.s, z2.h
-; CHECK-NEXT:    sunpkhi z6.s, z3.h
+; CHECK-NEXT:    sunpkhi z4.s, z2.h
+; CHECK-NEXT:    sunpkhi z5.s, z3.h
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z4.h, z1.b
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    sunpklo z5.h, z0.b
+; CHECK-NEXT:    sunpkhi z7.s, z5.h
+; CHECK-NEXT:    sunpklo z5.s, z5.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z3.h, z0.b
-; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    sunpkhi z6.s, z4.h
-; CHECK-NEXT:    sunpkhi z7.s, z3.h
-; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sunpklo z3.h, z1.b
+; CHECK-NEXT:    sunpkhi z6.s, z3.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z5.h
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z6.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z4.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z6.h
 ; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    ret
@@ -143,13 +143,12 @@ define <vscale x 8 x i16> @srem_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    sunpkhi z2.s, z1.h
 ; CHECK-NEXT:    sunpkhi z3.s, z0.h
-; CHECK-NEXT:    sunpklo z4.s, z1.h
+; CHECK-NEXT:    sunpklo z4.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z5.s, z0.h
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    ret
   %div = srem <vscale x 8 x i16> %a, %b
@@ -191,17 +190,17 @@ define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
 ; CHECK-NEXT:    uunpkhi z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpkhi z4.s, z2.h
 ; CHECK-NEXT:    uunpkhi z5.s, z3.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    uunpkhi z5.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    uunpkhi z3.s, z1.h
-; CHECK-NEXT:    uunpkhi z5.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
@@ -293,23 +292,23 @@ define <vscale x 16 x i8> @urem_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
 ; CHECK-NEXT:    uunpkhi z2.h, z1.b
 ; CHECK-NEXT:    uunpkhi z3.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpkhi z5.s, z2.h
-; CHECK-NEXT:    uunpkhi z6.s, z3.h
+; CHECK-NEXT:    uunpkhi z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z5.s, z3.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z4.h, z1.b
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    uunpklo z5.h, z0.b
+; CHECK-NEXT:    uunpkhi z7.s, z5.h
+; CHECK-NEXT:    uunpklo z5.s, z5.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z3.h, z0.b
-; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    uunpkhi z6.s, z4.h
-; CHECK-NEXT:    uunpkhi z7.s, z3.h
-; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    uunpklo z3.h, z1.b
+; CHECK-NEXT:    uunpkhi z6.s, z3.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z5.h
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z6.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z4.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z6.h
 ; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    ret
@@ -323,13 +322,12 @@ define <vscale x 8 x i16> @urem_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpkhi z2.s, z1.h
 ; CHECK-NEXT:    uunpkhi z3.s, z0.h
-; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    uunpklo z4.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z5.s, z0.h
-; CHECK-NEXT:    movprfx z3, z5
-; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    ret
   %div = urem <vscale x 8 x i16> %a, %b
@@ -424,9 +422,9 @@ define <vscale x 32 x i16> @smin_split_i16(<vscale x 32 x i16> %a, <vscale x 32
 ; CHECK-LABEL: smin_split_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    smin z2.h, p0/m, z2.h, z6.h
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z4.h
 ; CHECK-NEXT:    smin z1.h, p0/m, z1.h, z5.h
+; CHECK-NEXT:    smin z2.h, p0/m, z2.h, z6.h
 ; CHECK-NEXT:    smin z3.h, p0/m, z3.h, z7.h
 ; CHECK-NEXT:    ret
   %cmp = icmp slt <vscale x 32 x i16> %a, %b
@@ -1067,9 +1065,9 @@ define <vscale x 64 x i1> @cmp_split_64(<vscale x 64 x i8> %a, <vscale x 64 x i8
 ; CHECK-LABEL: cmp_split_64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p3.b
-; CHECK-NEXT:    cmpgt p2.b, p3/z, z2.b, z6.b
 ; CHECK-NEXT:    cmpgt p0.b, p3/z, z0.b, z4.b
 ; CHECK-NEXT:    cmpgt p1.b, p3/z, z1.b, z5.b
+; CHECK-NEXT:    cmpgt p2.b, p3/z, z2.b, z6.b
 ; CHECK-NEXT:    cmpgt p3.b, p3/z, z3.b, z7.b
 ; CHECK-NEXT:    ret
   %cmp = icmp sgt <vscale x 64 x i8> %a, %b
@@ -1083,12 +1081,12 @@ declare <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x
 define <vscale x 2 x i64> @fshl_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c){
 ; CHECK-LABEL: fshl_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z3.d, #63 // =0x3f
 ; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    lsr z1.d, z1.d, #1
 ; CHECK-NEXT:    bic z2.d, z3.d, z2.d
 ; CHECK-NEXT:    and z4.d, z4.d, #0x3f
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    lsr z1.d, z1.d, #1
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z4.d
 ; CHECK-NEXT:    lsr z1.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
@@ -1100,18 +1098,19 @@ define <vscale x 2 x i64> @fshl_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b
 define <vscale x 4 x i64> @fshl_illegal_i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, <vscale x 4 x i64> %c){
 ; CHECK-LABEL: fshl_illegal_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z6.d, #63 // =0x3f
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    bic z7.d, z6.d, z4.d
+; CHECK-NEXT:    mov z6.d, #63 // =0x3f
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    lsr z2.d, z2.d, #1
+; CHECK-NEXT:    lsr z3.d, z3.d, #1
+; CHECK-NEXT:    bic z4.d, z6.d, z4.d
+; CHECK-NEXT:    and z7.d, z7.d, #0x3f
 ; CHECK-NEXT:    bic z6.d, z6.d, z5.d
-; CHECK-NEXT:    and z4.d, z4.d, #0x3f
 ; CHECK-NEXT:    and z5.d, z5.d, #0x3f
-; CHECK-NEXT:    lsr z3.d, z3.d, #1
-; CHECK-NEXT:    lsr z2.d, p0/m, z2.d, z7.d
-; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z4.d
-; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z5.d
+; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z7.d
+; CHECK-NEXT:    lsr z2.d, p0/m, z2.d, z4.d
 ; CHECK-NEXT:    lsr z3.d, p0/m, z3.d, z6.d
+; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z5.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z2.d
 ; CHECK-NEXT:    orr z1.d, z1.d, z3.d
 ; CHECK-NEXT:    ret
@@ -1122,14 +1121,14 @@ define <vscale x 4 x i64> @fshl_illegal_i64(<vscale x 4 x i64> %a, <vscale x 4 x
 define <vscale x 2 x i64> @fshl_rot_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b){
 ; CHECK-LABEL: fshl_rot_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    subr z1.d, z1.d, #0 // =0x0
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    and z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    and z2.d, z2.d, #0x3f
-; CHECK-NEXT:    lsrr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    and z1.d, z1.d, #0x3f
+; CHECK-NEXT:    lslr z2.d, p0/m, z2.d, z0.d
+; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    orr z0.d, z2.d, z0.d
 ; CHECK-NEXT:    ret
   %fshl = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   ret <vscale x 2 x i64> %fshl
@@ -1139,22 +1138,22 @@ define <vscale x 2 x i64> @fshl_rot_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64
 define <vscale x 4 x i64> @fshl_rot_illegal_i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b){
 ; CHECK-LABEL: fshl_rot_illegal_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z4.d, z2.d
 ; CHECK-NEXT:    subr z2.d, z2.d, #0 // =0x0
 ; CHECK-NEXT:    mov z5.d, z3.d
 ; CHECK-NEXT:    subr z3.d, z3.d, #0 // =0x0
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    and z2.d, z2.d, #0x3f
 ; CHECK-NEXT:    and z4.d, z4.d, #0x3f
+; CHECK-NEXT:    and z2.d, z2.d, #0x3f
 ; CHECK-NEXT:    and z3.d, z3.d, #0x3f
-; CHECK-NEXT:    lsrr z2.d, p0/m, z2.d, z0.d
-; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z4.d
 ; CHECK-NEXT:    and z5.d, z5.d, #0x3f
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    lsl z4.d, p0/m, z4.d, z5.d
+; CHECK-NEXT:    lslr z4.d, p0/m, z4.d, z0.d
+; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    lsl z2.d, p0/m, z2.d, z5.d
 ; CHECK-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT:    orr z0.d, z0.d, z2.d
-; CHECK-NEXT:    orr z1.d, z4.d, z1.d
+; CHECK-NEXT:    orr z0.d, z4.d, z0.d
+; CHECK-NEXT:    orr z1.d, z2.d, z1.d
 ; CHECK-NEXT:    ret
   %fshl = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
   ret <vscale x 4 x i64> %fshl
@@ -1176,12 +1175,12 @@ define <vscale x 2 x i64> @fshl_rot_const_i64(<vscale x 2 x i64> %a){
 define <vscale x 2 x i64> @fshr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c){
 ; CHECK-LABEL: fshr_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z3.d, #63 // =0x3f
 ; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    lsl z0.d, z0.d, #1
 ; CHECK-NEXT:    bic z2.d, z3.d, z2.d
 ; CHECK-NEXT:    and z4.d, z4.d, #0x3f
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    lsl z0.d, z0.d, #1
 ; CHECK-NEXT:    lsr z1.d, p0/m, z1.d, z4.d
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z2.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z1.d
@@ -1193,14 +1192,14 @@ define <vscale x 2 x i64> @fshr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b
 define <vscale x 2 x i64> @fshr_rot_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b){
 ; CHECK-LABEL: fshr_rot_i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    subr z1.d, z1.d, #0 // =0x0
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    and z1.d, z1.d, #0x3f
 ; CHECK-NEXT:    and z2.d, z2.d, #0x3f
-; CHECK-NEXT:    lslr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    and z1.d, z1.d, #0x3f
+; CHECK-NEXT:    lsrr z2.d, p0/m, z2.d, z0.d
+; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    orr z0.d, z2.d, z0.d
 ; CHECK-NEXT:    ret
   %fshr = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
   ret <vscale x 2 x i64> %fshr

diff --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll
index 692e9edfedad49..993af08a66ddd9 100644
--- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll
+++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll
@@ -461,27 +461,27 @@ define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef
 ; CHECK-LABEL: predictor_4x4_neon:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldur w9, [x2, #2]
+; CHECK-NEXT:    ldur w8, [x2, #2]
 ; CHECK-NEXT:    ldr s1, [x2]
-; CHECK-NEXT:    lsl x8, x1, #1
 ; CHECK-NEXT:    ldur s2, [x2, #1]
-; CHECK-NEXT:    mov v0.s[0], w9
-; CHECK-NEXT:    lsr w9, w9, #24
 ; CHECK-NEXT:    ushll v3.8h, v2.8b, #1
-; CHECK-NEXT:    dup v4.8b, w9
-; CHECK-NEXT:    add x9, x8, x1
+; CHECK-NEXT:    mov v0.s[0], w8
+; CHECK-NEXT:    lsr w8, w8, #24
 ; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    urhadd v1.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    str s1, [x0]
 ; CHECK-NEXT:    add v0.8h, v0.8h, v3.8h
-; CHECK-NEXT:    zip1 v2.2s, v1.2s, v4.2s
+; CHECK-NEXT:    dup v3.8b, w8
+; CHECK-NEXT:    lsl x8, x1, #1
 ; CHECK-NEXT:    rshrn v0.8b, v0.8h, #2
+; CHECK-NEXT:    zip1 v2.2s, v1.2s, v3.2s
 ; CHECK-NEXT:    str s0, [x0, x1]
-; CHECK-NEXT:    zip1 v3.2s, v0.2s, v4.2s
-; CHECK-NEXT:    ext v1.8b, v2.8b, v0.8b, #1
+; CHECK-NEXT:    zip1 v3.2s, v0.2s, v3.2s
+; CHECK-NEXT:    ext v2.8b, v2.8b, v0.8b, #1
+; CHECK-NEXT:    str s2, [x0, x8]
+; CHECK-NEXT:    add x8, x8, x1
+; CHECK-NEXT:    ext v1.8b, v3.8b, v0.8b, #1
 ; CHECK-NEXT:    str s1, [x0, x8]
-; CHECK-NEXT:    ext v2.8b, v3.8b, v0.8b, #1
-; CHECK-NEXT:    str s2, [x0, x9]
 ; CHECK-NEXT:    ret
   %5 = load i32, ptr %2, align 4
   %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
@@ -537,24 +537,24 @@ define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noun
 ; CHECK-LABEL: predictor_4x4_neon_new:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x2]
-; CHECK-NEXT:    lsl x8, x1, #1
 ; CHECK-NEXT:    ldur s1, [x2, #1]
-; CHECK-NEXT:    add x9, x8, x1
+; CHECK-NEXT:    lsl x8, x1, #1
 ; CHECK-NEXT:    ldur s2, [x2, #2]
 ; CHECK-NEXT:    ldur s3, [x2, #3]
 ; CHECK-NEXT:    uaddl v4.8h, v1.8b, v0.8b
 ; CHECK-NEXT:    urhadd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    add x9, x8, x1
 ; CHECK-NEXT:    uaddl v5.8h, v2.8b, v1.8b
 ; CHECK-NEXT:    uaddl v3.8h, v3.8b, v2.8b
+; CHECK-NEXT:    urhadd v1.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    add v4.8h, v4.8h, v5.8h
 ; CHECK-NEXT:    add v3.8h, v3.8h, v5.8h
-; CHECK-NEXT:    rshrn v0.8b, v4.8h, #2
-; CHECK-NEXT:    str s0, [x0, x1]
-; CHECK-NEXT:    urhadd v0.8b, v1.8b, v2.8b
-; CHECK-NEXT:    rshrn v1.8b, v3.8h, #2
-; CHECK-NEXT:    str s0, [x0, x8]
-; CHECK-NEXT:    str s1, [x0, x9]
+; CHECK-NEXT:    rshrn v4.8b, v4.8h, #2
+; CHECK-NEXT:    rshrn v0.8b, v3.8h, #2
+; CHECK-NEXT:    str s4, [x0, x1]
+; CHECK-NEXT:    str s1, [x0, x8]
+; CHECK-NEXT:    str s0, [x0, x9]
 ; CHECK-NEXT:    ret
   %5 = load i32, ptr %2, align 4
   %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
@@ -608,9 +608,9 @@ define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noun
 define <vscale x 8 x i8> @loadnxv8i8(ptr %p) {
 ; CHECK-LABEL: loadnxv8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl1
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ldrb w8, [x0]
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    ret
   %l = load i8, ptr %p
@@ -631,9 +631,9 @@ define <vscale x 16 x i8> @loadnxv16i8(ptr %p) {
 define <vscale x 4 x i16> @loadnxv4i16(ptr %p) {
 ; CHECK-LABEL: loadnxv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl1
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    ret
   %l = load i16, ptr %p
@@ -654,9 +654,9 @@ define <vscale x 8 x i16> @loadnxv8i16(ptr %p) {
 define <vscale x 2 x i32> @loadnxv2i32(ptr %p) {
 ; CHECK-LABEL: loadnxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl1
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    ret
   %l = load i32, ptr %p
@@ -688,13 +688,13 @@ define <vscale x 2 x i64> @loadnxv2i64(ptr %p) {
 define <vscale x 4 x half> @loadnxv4f16(ptr %p) {
 ; CHECK-LABEL: loadnxv4f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    ldr h1, [x0]
 ; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ldr h1, [x0]
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    ret
   %l = load half, ptr %p
@@ -715,13 +715,13 @@ define <vscale x 8 x half> @loadnxv8f16(ptr %p) {
 define <vscale x 4 x bfloat> @loadnxv4bf16(ptr %p) {
 ; CHECK-LABEL: loadnxv4bf16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    ldr h1, [x0]
 ; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ldr h1, [x0]
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    ret
   %l = load bfloat, ptr %p
@@ -742,13 +742,13 @@ define <vscale x 8 x bfloat> @loadnxv8bf16(ptr %p) {
 define <vscale x 2 x float> @loadnxv2f32(ptr %p) {
 ; CHECK-LABEL: loadnxv2f32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    ldr s1, [x0]
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    ldr s1, [x0]
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    ret
   %l = load float, ptr %p
@@ -782,9 +782,9 @@ define <vscale x 2 x double> @loadnxv2f64(ptr %p) {
 define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv8i8_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #1]
 ; CHECK-NEXT:    ptrue p0.h, vl1
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ldrb w8, [x0, #1]
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
@@ -807,9 +807,9 @@ define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) {
 define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv4i16_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldurh w8, [x0, #1]
 ; CHECK-NEXT:    ptrue p0.s, vl1
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    ldurh w8, [x0, #1]
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
@@ -832,9 +832,9 @@ define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) {
 define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv2i32_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur w8, [x0, #1]
 ; CHECK-NEXT:    ptrue p0.d, vl1
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NEXT:    ldur w8, [x0, #1]
 ; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
@@ -869,13 +869,13 @@ define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) {
 define <vscale x 4 x half> @loadnxv4f16_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv4f16_offset:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    ldur h1, [x0, #1]
 ; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ldur h1, [x0, #1]
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
@@ -898,13 +898,13 @@ define <vscale x 8 x half> @loadnxv8f16_offset(ptr %p) {
 define <vscale x 4 x bfloat> @loadnxv4bf16_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv4bf16_offset:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    ldur h1, [x0, #1]
 ; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ldur h1, [x0, #1]
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
@@ -927,13 +927,13 @@ define <vscale x 8 x bfloat> @loadnxv8bf16_offset(ptr %p) {
 define <vscale x 2 x float> @loadnxv2f32_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv2f32_offset:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    ldur s1, [x0, #1]
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    ldur s1, [x0, #1]
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1

diff --git a/llvm/test/CodeGen/AArch64/logic-reassociate.ll b/llvm/test/CodeGen/AArch64/logic-reassociate.ll
index 5694dfc6b47443..3ca9a1fe222da6 100644
--- a/llvm/test/CodeGen/AArch64/logic-reassociate.ll
+++ b/llvm/test/CodeGen/AArch64/logic-reassociate.ll
@@ -14,8 +14,8 @@ define i32 @and_commute0(i32 %x, i32 %y) {
 define i128 @and_commute1(i128 %x, i128 %y) {
 ; CHECK-LABEL: and_commute1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x0, x2, x0
 ; CHECK-NEXT:    and x1, x3, x1
+; CHECK-NEXT:    and x0, x2, x0
 ; CHECK-NEXT:    ret
   %b = and i128 %y, %x
   %b2 = and i128 %x, %b

diff --git a/llvm/test/CodeGen/AArch64/logic-shift.ll b/llvm/test/CodeGen/AArch64/logic-shift.ll
index 24187abc7e994d..39f82dd4593fb6 100644
--- a/llvm/test/CodeGen/AArch64/logic-shift.ll
+++ b/llvm/test/CodeGen/AArch64/logic-shift.ll
@@ -200,10 +200,10 @@ define i64 @or_mix_shr(i64 %x0, i64 %x1, i64 %y, i64 %z) {
 define i64 @or_lshr_mix_shift_amount(i64 %x0, i64 %x1, i64 %y, i64 %z, i64 %w) {
 ; CHECK-LABEL: or_lshr_mix_shift_amount:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x9, x0, x2
-; CHECK-NEXT:    lsr x8, x1, x4
-; CHECK-NEXT:    orr x9, x9, x3
-; CHECK-NEXT:    orr x0, x9, x8
+; CHECK-NEXT:    lsr x8, x0, x2
+; CHECK-NEXT:    lsr x9, x1, x4
+; CHECK-NEXT:    orr x8, x8, x3
+; CHECK-NEXT:    orr x0, x8, x9
 ; CHECK-NEXT:    ret
   %sh1 = lshr i64 %x0, %y
   %sh2 = lshr i64 %x1, %w
@@ -428,10 +428,10 @@ define i64 @xor_mix_shr(i64 %x0, i64 %x1, i64 %y, i64 %z) {
 define i64 @xor_lshr_mix_shift_amount(i64 %x0, i64 %x1, i64 %y, i64 %z, i64 %w) {
 ; CHECK-LABEL: xor_lshr_mix_shift_amount:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x9, x0, x2
-; CHECK-NEXT:    lsr x8, x1, x4
-; CHECK-NEXT:    eor x9, x9, x3
-; CHECK-NEXT:    eor x0, x9, x8
+; CHECK-NEXT:    lsr x8, x0, x2
+; CHECK-NEXT:    lsr x9, x1, x4
+; CHECK-NEXT:    eor x8, x8, x3
+; CHECK-NEXT:    eor x0, x8, x9
 ; CHECK-NEXT:    ret
   %sh1 = lshr i64 %x0, %y
   %sh2 = lshr i64 %x1, %w
@@ -656,10 +656,10 @@ define i64 @and_mix_shr(i64 %x0, i64 %x1, i64 %y, i64 %z) {
 define i64 @and_lshr_mix_shift_amount(i64 %x0, i64 %x1, i64 %y, i64 %z, i64 %w) {
 ; CHECK-LABEL: and_lshr_mix_shift_amount:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x9, x0, x2
-; CHECK-NEXT:    lsr x8, x1, x4
-; CHECK-NEXT:    and x9, x9, x3
-; CHECK-NEXT:    and x0, x9, x8
+; CHECK-NEXT:    lsr x8, x0, x2
+; CHECK-NEXT:    lsr x9, x1, x4
+; CHECK-NEXT:    and x8, x8, x3
+; CHECK-NEXT:    and x0, x8, x9
 ; CHECK-NEXT:    ret
   %sh1 = lshr i64 %x0, %y
   %sh2 = lshr i64 %x1, %w
@@ -788,9 +788,10 @@ define i32 @or_fshr_commute1(i32 %x, i32 %y) {
 define i16 @or_fshr_commute2(i16 %x, i16 %y) {
 ; CHECK-LABEL: or_fshr_commute2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w0, w1
-; CHECK-NEXT:    lsl w0, w0, #9
-; CHECK-NEXT:    bfxil w0, w8, #7, #9
+; CHECK-NEXT:    lsl w8, w0, #9
+; CHECK-NEXT:    orr w9, w0, w1
+; CHECK-NEXT:    bfxil w8, w9, #7, #9
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %or1 = or i16 %x, %y
   %sh1 = shl i16 %x, 9
@@ -802,9 +803,10 @@ define i16 @or_fshr_commute2(i16 %x, i16 %y) {
 define i8 @or_fshr_commute3(i8 %x, i8 %y) {
 ; CHECK-LABEL: or_fshr_commute3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr w8, w1, w0
-; CHECK-NEXT:    lsl w0, w0, #2
-; CHECK-NEXT:    bfxil w0, w8, #6, #2
+; CHECK-NEXT:    lsl w8, w0, #2
+; CHECK-NEXT:    orr w9, w1, w0
+; CHECK-NEXT:    bfxil w8, w9, #6, #2
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %or1 = or i8 %y, %x
   %sh1 = shl i8 %x, 2

diff --git a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
index 42775e5689449f..c8c1e9007c7a0f 100644
--- a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
+++ b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -18,39 +18,39 @@ define void @logical_32bit() minsize {
 ; CHECK-NEXT:    ldr w9, [x9]
 ; CHECK-NEXT:    and w11, w10, w9
 ; CHECK-NEXT:    bic w12, w10, w9
-; CHECK-NEXT:    orr w13, w10, w9
 ; CHECK-NEXT:    str w11, [x8]
-; CHECK-NEXT:    orn w11, w10, w9
+; CHECK-NEXT:    orr w11, w10, w9
 ; CHECK-NEXT:    str w12, [x8]
-; CHECK-NEXT:    eor w12, w10, w9
-; CHECK-NEXT:    str w13, [x8]
-; CHECK-NEXT:    eon w13, w9, w10
+; CHECK-NEXT:    orn w12, w10, w9
+; CHECK-NEXT:    str w11, [x8]
+; CHECK-NEXT:    eor w11, w10, w9
+; CHECK-NEXT:    str w12, [x8]
+; CHECK-NEXT:    eon w12, w9, w10
 ; CHECK-NEXT:    str w11, [x8]
 ; CHECK-NEXT:    and w11, w10, w9, lsl #31
 ; CHECK-NEXT:    str w12, [x8]
 ; CHECK-NEXT:    bic w12, w10, w9, lsl #31
-; CHECK-NEXT:    str w13, [x8]
-; CHECK-NEXT:    orr w13, w10, w9, lsl #31
 ; CHECK-NEXT:    str w11, [x8]
-; CHECK-NEXT:    orn w11, w10, w9, lsl #31
+; CHECK-NEXT:    orr w11, w10, w9, lsl #31
+; CHECK-NEXT:    str w12, [x8]
+; CHECK-NEXT:    orn w12, w10, w9, lsl #31
+; CHECK-NEXT:    str w11, [x8]
+; CHECK-NEXT:    eor w11, w10, w9, lsl #31
 ; CHECK-NEXT:    str w12, [x8]
-; CHECK-NEXT:    eor w12, w10, w9, lsl #31
-; CHECK-NEXT:    str w13, [x8]
-; CHECK-NEXT:    eon w13, w10, w9, lsl #31
+; CHECK-NEXT:    eon w12, w10, w9, lsl #31
 ; CHECK-NEXT:    str w11, [x8]
 ; CHECK-NEXT:    bic w11, w10, w9, asr #10
 ; CHECK-NEXT:    str w12, [x8]
 ; CHECK-NEXT:    eor w12, w10, w9, asr #10
-; CHECK-NEXT:    str w13, [x8]
-; CHECK-NEXT:    orn w13, w10, w9, lsr #1
 ; CHECK-NEXT:    str w11, [x8]
-; CHECK-NEXT:    eor w11, w10, w9, lsr #1
+; CHECK-NEXT:    orn w11, w10, w9, lsr #1
 ; CHECK-NEXT:    str w12, [x8]
-; CHECK-NEXT:    eon w12, w10, w9, ror #20
-; CHECK-NEXT:    and w9, w10, w9, ror #20
-; CHECK-NEXT:    str w13, [x8]
+; CHECK-NEXT:    eor w12, w10, w9, lsr #1
 ; CHECK-NEXT:    str w11, [x8]
+; CHECK-NEXT:    eon w11, w10, w9, ror #20
+; CHECK-NEXT:    and w9, w10, w9, ror #20
 ; CHECK-NEXT:    str w12, [x8]
+; CHECK-NEXT:    str w11, [x8]
 ; CHECK-NEXT:    str w9, [x8]
 ; CHECK-NEXT:    ret
   %val1 = load i32, ptr @var1_32
@@ -134,39 +134,39 @@ define void @logical_64bit() minsize {
 ; CHECK-NEXT:    ldr x9, [x9]
 ; CHECK-NEXT:    and x11, x10, x9
 ; CHECK-NEXT:    bic x12, x10, x9
-; CHECK-NEXT:    orr x13, x10, x9
 ; CHECK-NEXT:    str x11, [x8]
-; CHECK-NEXT:    orn x11, x10, x9
+; CHECK-NEXT:    orr x11, x10, x9
 ; CHECK-NEXT:    str x12, [x8]
-; CHECK-NEXT:    eor x12, x10, x9
-; CHECK-NEXT:    str x13, [x8]
-; CHECK-NEXT:    eon x13, x9, x10
+; CHECK-NEXT:    orn x12, x10, x9
+; CHECK-NEXT:    str x11, [x8]
+; CHECK-NEXT:    eor x11, x10, x9
+; CHECK-NEXT:    str x12, [x8]
+; CHECK-NEXT:    eon x12, x9, x10
 ; CHECK-NEXT:    str x11, [x8]
 ; CHECK-NEXT:    and x11, x10, x9, lsl #63
 ; CHECK-NEXT:    str x12, [x8]
 ; CHECK-NEXT:    bic x12, x10, x9, lsl #63
-; CHECK-NEXT:    str x13, [x8]
-; CHECK-NEXT:    orr x13, x10, x9, lsl #63
 ; CHECK-NEXT:    str x11, [x8]
-; CHECK-NEXT:    orn x11, x10, x9, lsl #63
+; CHECK-NEXT:    orr x11, x10, x9, lsl #63
+; CHECK-NEXT:    str x12, [x8]
+; CHECK-NEXT:    orn x12, x10, x9, lsl #63
+; CHECK-NEXT:    str x11, [x8]
+; CHECK-NEXT:    eor x11, x10, x9, lsl #63
 ; CHECK-NEXT:    str x12, [x8]
-; CHECK-NEXT:    eor x12, x10, x9, lsl #63
-; CHECK-NEXT:    str x13, [x8]
-; CHECK-NEXT:    eon x13, x10, x9, lsl #63
+; CHECK-NEXT:    eon x12, x10, x9, lsl #63
 ; CHECK-NEXT:    str x11, [x8]
 ; CHECK-NEXT:    bic x11, x10, x9, asr #10
 ; CHECK-NEXT:    str x12, [x8]
 ; CHECK-NEXT:    eor x12, x10, x9, asr #10
-; CHECK-NEXT:    str x13, [x8]
-; CHECK-NEXT:    orn x13, x10, x9, lsr #1
 ; CHECK-NEXT:    str x11, [x8]
-; CHECK-NEXT:    eor x11, x10, x9, lsr #1
+; CHECK-NEXT:    orn x11, x10, x9, lsr #1
 ; CHECK-NEXT:    str x12, [x8]
-; CHECK-NEXT:    eon x12, x10, x9, ror #20
-; CHECK-NEXT:    and x9, x10, x9, ror #20
-; CHECK-NEXT:    str x13, [x8]
+; CHECK-NEXT:    eor x12, x10, x9, lsr #1
 ; CHECK-NEXT:    str x11, [x8]
+; CHECK-NEXT:    eon x11, x10, x9, ror #20
+; CHECK-NEXT:    and x9, x10, x9, ror #20
 ; CHECK-NEXT:    str x12, [x8]
+; CHECK-NEXT:    str x11, [x8]
 ; CHECK-NEXT:    str x9, [x8]
 ; CHECK-NEXT:    ret
   %val1 = load i64, ptr @var1_64
@@ -252,16 +252,17 @@ define void @flag_setting() {
 ; CHECK-NEXT:    ldr x9, [x8]
 ; CHECK-NEXT:    ldr x10, [x10]
 ; CHECK-NEXT:    tst x9, x10
-; CHECK-NEXT:    b.gt .LBB2_2
+; CHECK-NEXT:    b.gt .LBB2_4
 ; CHECK-NEXT:  // %bb.1: // %test2
 ; CHECK-NEXT:    tst x9, x10, lsl #63
+; CHECK-NEXT:    b.lt .LBB2_4
+; CHECK-NEXT:  // %bb.2: // %test3
 ; CHECK-NEXT:    and x10, x9, x10, asr #12
-; CHECK-NEXT:    ccmp x10, #1, #0, ge
-; CHECK-NEXT:    b.lt .LBB2_3
-; CHECK-NEXT:  .LBB2_2: // %common.ret
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB2_3: // %other_exit
+; CHECK-NEXT:    cmp x10, #1
+; CHECK-NEXT:    b.ge .LBB2_4
+; CHECK-NEXT:  // %bb.3: // %other_exit
 ; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:  .LBB2_4: // %common.ret
 ; CHECK-NEXT:    ret
   %val1 = load i64, ptr @var1_64
   %val2 = load i64, ptr @var2_64

diff --git a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
index 18bb4710e31c3f..ada8ae1b1f709a 100644
--- a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
+++ b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll
@@ -86,9 +86,9 @@ define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec
 ; CHECK-LABEL: addmuli16_and:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    smull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT:    movi v3.2d, #0x00ffff0000ffff
 ; CHECK-NEXT:    smlal v1.4s, v0.4h, v2.4h
-; CHECK-NEXT:    and v0.16b, v1.16b, v3.16b
+; CHECK-NEXT:    movi v0.2d, #0x00ffff0000ffff
+; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    ret
 entry:
   %v0 = sext <4 x i16> %vec0 to <4 x i32>
@@ -214,9 +214,9 @@ define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec
 ; CHECK-LABEL: addmuli32_and:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    smull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    movi v3.2d, #0x000000ffffffff
 ; CHECK-NEXT:    smlal v1.2d, v0.2s, v2.2s
-; CHECK-NEXT:    and v0.16b, v1.16b, v3.16b
+; CHECK-NEXT:    movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    ret
 entry:
   %v0 = sext <2 x i32> %vec0 to <2 x i64>

diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
index 5ede3c78fdcd71..15a484d11b0a1d 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -7,8 +7,8 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $q0
 ; CHECK-NEXT:    cbz w2, .LBB0_8
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
-; CHECK-NEXT:    mov w8, w2
 ; CHECK-NEXT:    cmp w2, #15
+; CHECK-NEXT:    mov w8, w2
 ; CHECK-NEXT:    b.hi .LBB0_3
 ; CHECK-NEXT:  // %bb.2:
 ; CHECK-NEXT:    mov x9, xzr
@@ -20,13 +20,13 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
 ; CHECK-NEXT:    mov x12, x9
 ; CHECK-NEXT:  .LBB0_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q1, q2, [x11, #-16]
+; CHECK-NEXT:    ldp q1, q3, [x11, #-16]
 ; CHECK-NEXT:    subs x12, x12, #16
+; CHECK-NEXT:    ldp q2, q4, [x10, #-16]
 ; CHECK-NEXT:    add x11, x11, #32
-; CHECK-NEXT:    ldp q3, q4, [x10, #-16]
-; CHECK-NEXT:    fmla v3.8h, v1.8h, v0.h[0]
-; CHECK-NEXT:    fmla v4.8h, v2.8h, v0.h[0]
-; CHECK-NEXT:    stp q3, q4, [x10, #-16]
+; CHECK-NEXT:    fmla v2.8h, v1.8h, v0.h[0]
+; CHECK-NEXT:    fmla v4.8h, v3.8h, v0.h[0]
+; CHECK-NEXT:    stp q2, q4, [x10, #-16]
 ; CHECK-NEXT:    add x10, x10, #32
 ; CHECK-NEXT:    b.ne .LBB0_4
 ; CHECK-NEXT:  // %bb.5: // %middle.block

diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll b/llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll
index 77c3d4e4df2df9..6bee9f3f65662c 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll
@@ -9,12 +9,12 @@
 define i32 @test1(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add w9, w0, #100
-; CHECK-NEXT:    orr w8, w2, #0x80
-; CHECK-NEXT:    sub w8, w8, w9
-; CHECK-NEXT:    eor w9, w1, w9, lsl #8
-; CHECK-NEXT:    sub w8, w8, w9
-; CHECK-NEXT:    eor w0, w8, w9, asr #13
+; CHECK-NEXT:    add w8, w0, #100
+; CHECK-NEXT:    orr w9, w2, #0x80
+; CHECK-NEXT:    eor w10, w1, w8, lsl #8
+; CHECK-NEXT:    sub w8, w9, w8
+; CHECK-NEXT:    sub w8, w8, w10
+; CHECK-NEXT:    eor w0, w8, w10, asr #13
 ; CHECK-NEXT:    ret
 entry:
   %c1  = or  i32 %c, 128
@@ -32,12 +32,12 @@ entry:
 define i64 @test2(i64 %a, i64 %b, i64 %c) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add x9, x0, #100
-; CHECK-NEXT:    orr x8, x2, #0x80
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    eor x9, x1, x9, lsl #8
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    eor x0, x8, x9, asr #13
+; CHECK-NEXT:    add x8, x0, #100
+; CHECK-NEXT:    orr x9, x2, #0x80
+; CHECK-NEXT:    eor x10, x1, x8, lsl #8
+; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    sub x8, x8, x10
+; CHECK-NEXT:    eor x0, x8, x10, asr #13
 ; CHECK-NEXT:    ret
 entry:
   %c1  = or  i64 %c, 128
@@ -55,12 +55,12 @@ entry:
 define i32 @test3(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add w9, w0, #100
-; CHECK-NEXT:    orr w8, w2, #0x80
-; CHECK-NEXT:    add w8, w8, w9
-; CHECK-NEXT:    eor w9, w1, w9, lsl #8
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    eor w0, w8, w9, asr #13
+; CHECK-NEXT:    add w8, w0, #100
+; CHECK-NEXT:    orr w9, w2, #0x80
+; CHECK-NEXT:    eor w10, w1, w8, lsl #8
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    sub w8, w10, w8
+; CHECK-NEXT:    eor w0, w8, w10, asr #13
 ; CHECK-NEXT:    ret
 entry:
   %c1  = or  i32 %c, 128

diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-transient.ll b/llvm/test/CodeGen/AArch64/machine-combiner-transient.ll
index 154304db775b1e..13845523f996c2 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-transient.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-transient.ll
@@ -11,10 +11,10 @@
 define i32 @test1(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub w8, w2, w0
-; CHECK-NEXT:    eor w9, w1, w0, lsl #8
-; CHECK-NEXT:    sub w8, w8, w9
-; CHECK-NEXT:    eor w0, w8, w9, asr #13
+; CHECK-NEXT:    eor w8, w1, w0, lsl #8
+; CHECK-NEXT:    sub w9, w2, w0
+; CHECK-NEXT:    sub w9, w9, w8
+; CHECK-NEXT:    eor w0, w9, w8, asr #13
 ; CHECK-NEXT:    ret
 entry:
   %shl = shl i32 %a, 8
@@ -30,10 +30,10 @@ entry:
 define i64 @test2(i64 %a, i64 %b, i64 %c) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub x8, x2, x0
-; CHECK-NEXT:    eor x9, x1, x0, lsl #8
-; CHECK-NEXT:    sub x8, x8, x9
-; CHECK-NEXT:    eor x0, x8, x9, asr #13
+; CHECK-NEXT:    eor x8, x1, x0, lsl #8
+; CHECK-NEXT:    sub x9, x2, x0
+; CHECK-NEXT:    sub x9, x9, x8
+; CHECK-NEXT:    eor x0, x9, x8, asr #13
 ; CHECK-NEXT:    ret
 entry:
   %shl = shl i64 %a, 8

diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
index 1db0ce9fef73e0..3230c9e946da77 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll
@@ -131,23 +131,23 @@ define i32 @cant_sink_successive_store(ptr nocapture readnone %store, i32 %n) {
 ; CHECK-NEXT:    b.lt .LBB2_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    adrp x8, A
-; CHECK-NEXT:    mov w9, #42
-; CHECK-NEXT:    mov w20, w19
-; CHECK-NEXT:    ldr w21, [x8, :lo12:A]
-; CHECK-NEXT:    str w9, [x0]
+; CHECK-NEXT:    mov w21, w19
+; CHECK-NEXT:    ldr w20, [x8, :lo12:A]
+; CHECK-NEXT:    mov w8, #42 // =0x2a
+; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:  .LBB2_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov w0, w21
+; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl _Z3usei
-; CHECK-NEXT:    sdiv w20, w20, w0
+; CHECK-NEXT:    sdiv w21, w21, w0
 ; CHECK-NEXT:    subs w19, w19, #1
 ; CHECK-NEXT:    b.ne .LBB2_2
 ; CHECK-NEXT:    b .LBB2_4
 ; CHECK-NEXT:  .LBB2_3:
-; CHECK-NEXT:    mov w20, w19
+; CHECK-NEXT:    mov w21, w19
 ; CHECK-NEXT:  .LBB2_4: // %for.cond.cleanup
-; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    mov w0, w21
 ; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
index 80c29edd896667..d4da4f28bb4a3c 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
@@ -9,17 +9,17 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
 ; CHECK-NEXT:    b.lt .LBB0_10
 ; CHECK-NEXT:  // %bb.1: // %for.cond1.preheader.us.preheader
 ; CHECK-NEXT:    mov w10, w0
+; CHECK-NEXT:    ubfiz x11, x0, #2, #32
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    mov x9, xzr
-; CHECK-NEXT:    and x11, x10, #0xfffffff0
-; CHECK-NEXT:    add x12, x1, #32
-; CHECK-NEXT:    ubfiz x13, x0, #2, #32
+; CHECK-NEXT:    and x12, x10, #0xfffffff0
+; CHECK-NEXT:    add x13, x1, #32
 ; CHECK-NEXT:    add x14, x2, #16
 ; CHECK-NEXT:    b .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    add x9, x9, #1
-; CHECK-NEXT:    add x12, x12, x13
+; CHECK-NEXT:    add x13, x13, x11
 ; CHECK-NEXT:    add x8, x8, x10
 ; CHECK-NEXT:    cmp x9, x10
 ; CHECK-NEXT:    b.eq .LBB0_10
@@ -36,43 +36,43 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
 ; CHECK-NEXT:  .LBB0_5: // %vector.ph
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    mov x16, x14
-; CHECK-NEXT:    mov x17, x12
-; CHECK-NEXT:    mov x18, x11
+; CHECK-NEXT:    mov x17, x13
+; CHECK-NEXT:    mov x18, x12
 ; CHECK-NEXT:  .LBB0_6: // %vector.body
 ; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldp q0, q1, [x16, #-16]
-; CHECK-NEXT:    dup v3.8h, w15
+; CHECK-NEXT:    dup v0.8h, w15
+; CHECK-NEXT:    ldp q1, q4, [x16, #-16]
+; CHECK-NEXT:    ldp q3, q2, [x17, #-32]
 ; CHECK-NEXT:    subs x18, x18, #16
-; CHECK-NEXT:    add x16, x16, #32
-; CHECK-NEXT:    ldp q4, q2, [x17, #-32]
-; CHECK-NEXT:    smlal v4.4s, v3.4h, v0.4h
 ; CHECK-NEXT:    ldp q6, q5, [x17]
-; CHECK-NEXT:    smlal2 v2.4s, v3.8h, v0.8h
-; CHECK-NEXT:    smlal v6.4s, v3.4h, v1.4h
-; CHECK-NEXT:    stp q4, q2, [x17, #-32]
-; CHECK-NEXT:    smlal2 v5.4s, v3.8h, v1.8h
+; CHECK-NEXT:    add x16, x16, #32
+; CHECK-NEXT:    smlal2 v2.4s, v0.8h, v1.8h
+; CHECK-NEXT:    smlal v3.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smlal2 v5.4s, v0.8h, v4.8h
+; CHECK-NEXT:    smlal v6.4s, v0.4h, v4.4h
+; CHECK-NEXT:    stp q3, q2, [x17, #-32]
 ; CHECK-NEXT:    stp q6, q5, [x17], #64
 ; CHECK-NEXT:    b.ne .LBB0_6
 ; CHECK-NEXT:  // %bb.7: // %middle.block
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    mov x18, x11
-; CHECK-NEXT:    cmp x11, x10
+; CHECK-NEXT:    cmp x12, x10
+; CHECK-NEXT:    mov x18, x12
 ; CHECK-NEXT:    b.eq .LBB0_2
 ; CHECK-NEXT:  .LBB0_8: // %for.body4.us.preheader
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    add x17, x18, x8
-; CHECK-NEXT:    sub x16, x10, x18
-; CHECK-NEXT:    add x18, x2, x18, lsl #1
-; CHECK-NEXT:    add x17, x1, x17, lsl #2
+; CHECK-NEXT:    add x16, x18, x8
+; CHECK-NEXT:    add x17, x2, x18, lsl #1
+; CHECK-NEXT:    sub x18, x10, x18
+; CHECK-NEXT:    add x16, x1, x16, lsl #2
 ; CHECK-NEXT:  .LBB0_9: // %for.body4.us
 ; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrsh w3, [x18], #2
-; CHECK-NEXT:    ldr w4, [x17]
-; CHECK-NEXT:    subs x16, x16, #1
+; CHECK-NEXT:    ldrsh w3, [x17], #2
+; CHECK-NEXT:    ldr w4, [x16]
+; CHECK-NEXT:    subs x18, x18, #1
 ; CHECK-NEXT:    madd w3, w3, w15, w4
-; CHECK-NEXT:    str w3, [x17], #4
+; CHECK-NEXT:    str w3, [x16], #4
 ; CHECK-NEXT:    b.ne .LBB0_9
 ; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_10: // %for.cond.cleanup

diff --git a/llvm/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll b/llvm/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll
index 7a45828b6c8f91..be1c0f21cc77ab 100644
--- a/llvm/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll
+++ b/llvm/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll
@@ -7,11 +7,11 @@
 define i64 @csed_impdef_killflag(i64 %a) {
 ; CHECK-LABEL: csed_impdef_killflag:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-NEXT:    cmp x0, #0
+; CHECK-NEXT:    mov x9, #2 ; =0x2
 ; CHECK-NEXT:    csel w8, wzr, w8, ne
-; CHECK-NEXT:    mov x9, #2
-; CHECK-NEXT:    mov x10, #3
+; CHECK-NEXT:    mov x10, #3 ; =0x3
 ; CHECK-NEXT:    ubfx x8, x8, #0, #32
 ; CHECK-NEXT:    csel x9, x9, x10, ne
 ; CHECK-NEXT:    add x0, x9, x8

diff --git a/llvm/test/CodeGen/AArch64/madd-combiner.ll b/llvm/test/CodeGen/AArch64/madd-combiner.ll
index 28e80b1f0fd38c..cfdeb3d97a5df5 100644
--- a/llvm/test/CodeGen/AArch64/madd-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/madd-combiner.ll
@@ -6,7 +6,7 @@
 define i32 @mul_add_imm(i32 %a, i32 %b) {
 ; CHECK-LABEL: mul_add_imm:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-NEXT:    madd w0, w0, w1, w8
 ; CHECK-NEXT:    ret
   %1 = mul i32 %a, %b
@@ -17,7 +17,7 @@ define i32 @mul_add_imm(i32 %a, i32 %b) {
 define i32 @mul_sub_imm1(i32 %a, i32 %b) {
 ; CHECK-LABEL: mul_sub_imm1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-NEXT:    msub w0, w0, w1, w8
 ; CHECK-NEXT:    ret
   %1 = mul i32 %a, %b
@@ -29,7 +29,7 @@ define i32 @mul_sub_imm1(i32 %a, i32 %b) {
 define void @mul_add_imm2() {
 ; CHECK-ISEL-LABEL: mul_add_imm2:
 ; CHECK-ISEL:       ; %bb.0: ; %entry
-; CHECK-ISEL-NEXT:    mov w8, #1
+; CHECK-ISEL-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-ISEL-NEXT:  LBB2_1: ; %for.body8
 ; CHECK-ISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-ISEL-NEXT:    cbnz w8, LBB2_1
@@ -38,10 +38,10 @@ define void @mul_add_imm2() {
 ;
 ; CHECK-FAST-LABEL: mul_add_imm2:
 ; CHECK-FAST:       ; %bb.0: ; %entry
-; CHECK-FAST-NEXT:    mov x8, #-3
-; CHECK-FAST-NEXT:    mov x9, #-3
+; CHECK-FAST-NEXT:    mov x8, #-3 ; =0xfffffffffffffffd
+; CHECK-FAST-NEXT:    mov x9, #-3 ; =0xfffffffffffffffd
 ; CHECK-FAST-NEXT:    madd x8, x8, x8, x9
-; CHECK-FAST-NEXT:    mov x9, #45968
+; CHECK-FAST-NEXT:    mov x9, #45968 ; =0xb390
 ; CHECK-FAST-NEXT:    movk x9, #48484, lsl #16
 ; CHECK-FAST-NEXT:    movk x9, #323, lsl #32
 ; CHECK-FAST-NEXT:  LBB2_1: ; %for.body8
@@ -120,9 +120,9 @@ define i64 @add1_mul_val4(i64 %a, i64 %b, i64 %c) {
 ;
 ; CHECK-FAST-LABEL: add1_mul_val4:
 ; CHECK-FAST:       ; %bb.0:
-; CHECK-FAST-NEXT:    add x8, x1, #1
-; CHECK-FAST-NEXT:    add x9, x0, x2
-; CHECK-FAST-NEXT:    mul x0, x9, x8
+; CHECK-FAST-NEXT:    add x8, x0, x2
+; CHECK-FAST-NEXT:    add x9, x1, #1
+; CHECK-FAST-NEXT:    mul x0, x8, x9
 ; CHECK-FAST-NEXT:    ret
   %1 = add i64 %a, %c
   %2 = add i64 %b, 1
@@ -138,7 +138,7 @@ define i32 @sub1_mul_val1(i32 %a, i32 %b) {
 ;
 ; CHECK-FAST-LABEL: sub1_mul_val1:
 ; CHECK-FAST:       ; %bb.0:
-; CHECK-FAST-NEXT:    mov w8, #1
+; CHECK-FAST-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-FAST-NEXT:    sub w8, w8, w0
 ; CHECK-FAST-NEXT:    mul w0, w8, w1
 ; CHECK-FAST-NEXT:    ret
@@ -155,7 +155,7 @@ define i32 @sub1_mul_val2(i32 %a, i32 %b) {
 ;
 ; CHECK-FAST-LABEL: sub1_mul_val2:
 ; CHECK-FAST:       ; %bb.0:
-; CHECK-FAST-NEXT:    mov w8, #1
+; CHECK-FAST-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-FAST-NEXT:    sub w8, w8, w1
 ; CHECK-FAST-NEXT:    mul w0, w0, w8
 ; CHECK-FAST-NEXT:    ret
@@ -172,7 +172,7 @@ define i64 @sub1_mul_val3(i64 %a, i64 %b) {
 ;
 ; CHECK-FAST-LABEL: sub1_mul_val3:
 ; CHECK-FAST:       ; %bb.0:
-; CHECK-FAST-NEXT:    mov x8, #1
+; CHECK-FAST-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-FAST-NEXT:    sub x8, x8, x1
 ; CHECK-FAST-NEXT:    mul x0, x0, x8
 ; CHECK-FAST-NEXT:    ret
@@ -190,7 +190,7 @@ define i64 @sub1_mul_val4(i64 %a, i64 %b) {
 ;
 ; CHECK-FAST-LABEL: sub1_mul_val4:
 ; CHECK-FAST:       ; %bb.0:
-; CHECK-FAST-NEXT:    mov x8, #1
+; CHECK-FAST-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-FAST-NEXT:    sub x9, x0, #1
 ; CHECK-FAST-NEXT:    sub x8, x8, x1
 ; CHECK-FAST-NEXT:    mul x0, x9, x8

diff --git a/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll b/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll
index 54f75550f7e8b5..f122c94d5cffa2 100644
--- a/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll
+++ b/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll
@@ -15,9 +15,9 @@ define i32 @test_memcpy(i32* nocapture %p, i32* nocapture readonly %q) {
 ; CHECK-LABEL: test_memcpy:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp w9, w10, [x1]
+; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    add w0, w9, w10
-; CHECK-NEXT:    ldr q0, [x8, #16]
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %p0 = bitcast i32* %p to i8*
@@ -38,9 +38,9 @@ define i32 @test_memcpy_inline(i32* nocapture %p, i32* nocapture readonly %q) {
 ; CHECK-LABEL: test_memcpy_inline:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp w9, w10, [x1]
+; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    add w0, w9, w10
-; CHECK-NEXT:    ldr q0, [x8, #16]
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %p0 = bitcast i32* %p to i8*
@@ -61,9 +61,9 @@ define i32 @test_memmove(i32* nocapture %p, i32* nocapture readonly %q) {
 ; CHECK-LABEL: test_memmove:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp w9, w10, [x1]
+; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    add w0, w9, w10
-; CHECK-NEXT:    ldr q0, [x8, #16]
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %p0 = bitcast i32* %p to i8*
@@ -86,9 +86,9 @@ define i32 @test_memset(i32* nocapture %p, i32* nocapture readonly %q) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp w10, w11, [x1]
 ; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    mov x9, #-6148914691236517206
-; CHECK-NEXT:    add w0, w10, w11
+; CHECK-NEXT:    mov x9, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa
 ; CHECK-NEXT:    stp x9, x9, [x8]
+; CHECK-NEXT:    add w0, w10, w11
 ; CHECK-NEXT:    ret
   %p0 = bitcast i32* %p to i8*
   tail call void @llvm.memset.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(16) %p0, i8 170, i64 16, i1 false), !alias.scope !2, !noalias !4
@@ -106,9 +106,9 @@ define i32 @test_mempcpy(i32* nocapture %p, i32* nocapture readonly %q) {
 ; CHECK-LABEL: test_mempcpy:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp w9, w10, [x1]
+; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    add w0, w9, w10
-; CHECK-NEXT:    ldr q0, [x8, #16]
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %p0 = bitcast i32* %p to i8*

diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
index 0f2cb775e98e0b..b161d746ad11d5 100644
--- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
@@ -524,11 +524,11 @@ define void @le_i64_to_i16_order(i64 %x, ptr %p0) {
 define void @be_i64_to_i16(i64 %x, ptr %p0) {
 ; LE-LABEL: be_i64_to_i16:
 ; LE:       // %bb.0:
-; LE-NEXT:    lsr x8, x0, #32
-; LE-NEXT:    ror w9, w0, #16
+; LE-NEXT:    ror w8, w0, #16
+; LE-NEXT:    lsr x9, x0, #32
 ; LE-NEXT:    lsr x10, x0, #48
-; LE-NEXT:    strh w8, [x1, #2]
-; LE-NEXT:    str w9, [x1, #4]
+; LE-NEXT:    str w8, [x1, #4]
+; LE-NEXT:    strh w9, [x1, #2]
 ; LE-NEXT:    strh w10, [x1]
 ; LE-NEXT:    ret
 ;
@@ -749,16 +749,16 @@ define void @i64_to_i8_incomplete(i64 %x, ptr %p0) {
 ; CHECK-NEXT:    lsr x8, x0, #56
 ; CHECK-NEXT:    lsr x9, x0, #48
 ; CHECK-NEXT:    lsr x10, x0, #40
-; CHECK-NEXT:    lsr x11, x0, #32
 ; CHECK-NEXT:    strb w0, [x1, #7]
 ; CHECK-NEXT:    strb w8, [x1]
-; CHECK-NEXT:    lsr x8, x0, #16
+; CHECK-NEXT:    lsr x8, x0, #32
 ; CHECK-NEXT:    strb w9, [x1, #1]
-; CHECK-NEXT:    lsr x9, x0, #8
+; CHECK-NEXT:    lsr x9, x0, #16
 ; CHECK-NEXT:    strb w10, [x1, #2]
-; CHECK-NEXT:    strb w11, [x1, #3]
-; CHECK-NEXT:    strb w8, [x1, #5]
-; CHECK-NEXT:    strb w9, [x1, #6]
+; CHECK-NEXT:    lsr x10, x0, #8
+; CHECK-NEXT:    strb w8, [x1, #3]
+; CHECK-NEXT:    strb w9, [x1, #5]
+; CHECK-NEXT:    strb w10, [x1, #6]
 ; CHECK-NEXT:    ret
   %sh1 = lshr i64 %x, 8
   %sh2 = lshr i64 %x, 16

diff --git a/llvm/test/CodeGen/AArch64/midpoint-int.ll b/llvm/test/CodeGen/AArch64/midpoint-int.ll
index 2ef7c54f6d9147..1043fa5c4565ee 100644
--- a/llvm/test/CodeGen/AArch64/midpoint-int.ll
+++ b/llvm/test/CodeGen/AArch64/midpoint-int.ll
@@ -14,13 +14,13 @@ define i32 @scalar_i32_signed_reg_reg(i32 %a1, i32 %a2) nounwind {
 ; CHECK-LABEL: scalar_i32_signed_reg_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp w0, w1
-; CHECK-NEXT:    mov w10, #-1
-; CHECK-NEXT:    csel w8, w1, w0, gt
-; CHECK-NEXT:    csel w9, w0, w1, gt
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    cneg w9, w10, le
-; CHECK-NEXT:    lsr w8, w8, #1
-; CHECK-NEXT:    madd w0, w8, w9, w0
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    csel w9, w1, w0, gt
+; CHECK-NEXT:    csel w10, w0, w1, gt
+; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w9, w10, w9
+; CHECK-NEXT:    lsr w9, w9, #1
+; CHECK-NEXT:    madd w0, w9, w8, w0
 ; CHECK-NEXT:    ret
   %t3 = icmp sgt i32 %a1, %a2 ; signed
   %t4 = select i1 %t3, i32 -1, i32 1
@@ -37,13 +37,13 @@ define i32 @scalar_i32_unsigned_reg_reg(i32 %a1, i32 %a2) nounwind {
 ; CHECK-LABEL: scalar_i32_unsigned_reg_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp w0, w1
-; CHECK-NEXT:    mov w10, #-1
-; CHECK-NEXT:    csel w8, w1, w0, hi
-; CHECK-NEXT:    csel w9, w0, w1, hi
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    cneg w9, w10, ls
-; CHECK-NEXT:    lsr w8, w8, #1
-; CHECK-NEXT:    madd w0, w8, w9, w0
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    csel w9, w1, w0, hi
+; CHECK-NEXT:    csel w10, w0, w1, hi
+; CHECK-NEXT:    cneg w8, w8, ls
+; CHECK-NEXT:    sub w9, w10, w9
+; CHECK-NEXT:    lsr w9, w9, #1
+; CHECK-NEXT:    madd w0, w9, w8, w0
 ; CHECK-NEXT:    ret
   %t3 = icmp ugt i32 %a1, %a2
   %t4 = select i1 %t3, i32 -1, i32 1
@@ -62,12 +62,12 @@ define i32 @scalar_i32_signed_mem_reg(ptr %a1_addr, i32 %a2) nounwind {
 ; CHECK-LABEL: scalar_i32_signed_mem_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    csel w10, w1, w9, gt
 ; CHECK-NEXT:    csel w11, w9, w1, gt
-; CHECK-NEXT:    sub w10, w11, w10
 ; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w10, w11, w10
 ; CHECK-NEXT:    lsr w10, w10, #1
 ; CHECK-NEXT:    madd w0, w10, w8, w9
 ; CHECK-NEXT:    ret
@@ -87,12 +87,12 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, ptr %a2_addr) nounwind {
 ; CHECK-LABEL: scalar_i32_signed_reg_mem:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cmp w0, w9
 ; CHECK-NEXT:    csel w10, w9, w0, gt
 ; CHECK-NEXT:    csel w9, w0, w9, gt
-; CHECK-NEXT:    sub w9, w9, w10
 ; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w9, w9, w10
 ; CHECK-NEXT:    lsr w9, w9, #1
 ; CHECK-NEXT:    madd w0, w9, w8, w0
 ; CHECK-NEXT:    ret
@@ -112,13 +112,13 @@ define i32 @scalar_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; CHECK-LABEL: scalar_i32_signed_mem_mem:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
 ; CHECK-NEXT:    ldr w10, [x1]
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cmp w9, w10
 ; CHECK-NEXT:    csel w11, w10, w9, gt
 ; CHECK-NEXT:    csel w10, w9, w10, gt
-; CHECK-NEXT:    sub w10, w10, w11
 ; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w10, w10, w11
 ; CHECK-NEXT:    lsr w10, w10, #1
 ; CHECK-NEXT:    madd w0, w10, w8, w9
 ; CHECK-NEXT:    ret
@@ -145,13 +145,13 @@ define i64 @scalar_i64_signed_reg_reg(i64 %a1, i64 %a2) nounwind {
 ; CHECK-LABEL: scalar_i64_signed_reg_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp x0, x1
-; CHECK-NEXT:    mov x10, #-1
-; CHECK-NEXT:    csel x8, x1, x0, gt
-; CHECK-NEXT:    csel x9, x0, x1, gt
-; CHECK-NEXT:    sub x8, x9, x8
-; CHECK-NEXT:    cneg x9, x10, le
-; CHECK-NEXT:    lsr x8, x8, #1
-; CHECK-NEXT:    madd x0, x8, x9, x0
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    csel x9, x1, x0, gt
+; CHECK-NEXT:    csel x10, x0, x1, gt
+; CHECK-NEXT:    cneg x8, x8, le
+; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    lsr x9, x9, #1
+; CHECK-NEXT:    madd x0, x9, x8, x0
 ; CHECK-NEXT:    ret
   %t3 = icmp sgt i64 %a1, %a2 ; signed
   %t4 = select i1 %t3, i64 -1, i64 1
@@ -168,13 +168,13 @@ define i64 @scalar_i64_unsigned_reg_reg(i64 %a1, i64 %a2) nounwind {
 ; CHECK-LABEL: scalar_i64_unsigned_reg_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp x0, x1
-; CHECK-NEXT:    mov x10, #-1
-; CHECK-NEXT:    csel x8, x1, x0, hi
-; CHECK-NEXT:    csel x9, x0, x1, hi
-; CHECK-NEXT:    sub x8, x9, x8
-; CHECK-NEXT:    cneg x9, x10, ls
-; CHECK-NEXT:    lsr x8, x8, #1
-; CHECK-NEXT:    madd x0, x8, x9, x0
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    csel x9, x1, x0, hi
+; CHECK-NEXT:    csel x10, x0, x1, hi
+; CHECK-NEXT:    cneg x8, x8, ls
+; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    lsr x9, x9, #1
+; CHECK-NEXT:    madd x0, x9, x8, x0
 ; CHECK-NEXT:    ret
   %t3 = icmp ugt i64 %a1, %a2
   %t4 = select i1 %t3, i64 -1, i64 1
@@ -193,12 +193,12 @@ define i64 @scalar_i64_signed_mem_reg(ptr %a1_addr, i64 %a2) nounwind {
 ; CHECK-LABEL: scalar_i64_signed_mem_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    cmp x9, x1
 ; CHECK-NEXT:    csel x10, x1, x9, gt
 ; CHECK-NEXT:    csel x11, x9, x1, gt
-; CHECK-NEXT:    sub x10, x11, x10
 ; CHECK-NEXT:    cneg x8, x8, le
+; CHECK-NEXT:    sub x10, x11, x10
 ; CHECK-NEXT:    lsr x10, x10, #1
 ; CHECK-NEXT:    madd x0, x10, x8, x9
 ; CHECK-NEXT:    ret
@@ -218,12 +218,12 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, ptr %a2_addr) nounwind {
 ; CHECK-LABEL: scalar_i64_signed_reg_mem:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    mov x8, #-1
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    cmp x0, x9
 ; CHECK-NEXT:    csel x10, x9, x0, gt
 ; CHECK-NEXT:    csel x9, x0, x9, gt
-; CHECK-NEXT:    sub x9, x9, x10
 ; CHECK-NEXT:    cneg x8, x8, le
+; CHECK-NEXT:    sub x9, x9, x10
 ; CHECK-NEXT:    lsr x9, x9, #1
 ; CHECK-NEXT:    madd x0, x9, x8, x0
 ; CHECK-NEXT:    ret
@@ -243,13 +243,13 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; CHECK-LABEL: scalar_i64_signed_mem_mem:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov x8, #-1
 ; CHECK-NEXT:    ldr x10, [x1]
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    cmp x9, x10
 ; CHECK-NEXT:    csel x11, x10, x9, gt
 ; CHECK-NEXT:    csel x10, x9, x10, gt
-; CHECK-NEXT:    sub x10, x10, x11
 ; CHECK-NEXT:    cneg x8, x8, le
+; CHECK-NEXT:    sub x10, x10, x11
 ; CHECK-NEXT:    lsr x10, x10, #1
 ; CHECK-NEXT:    madd x0, x10, x8, x9
 ; CHECK-NEXT:    ret
@@ -275,15 +275,15 @@ define i64 @scalar_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind {
 ; CHECK-LABEL: scalar_i16_signed_reg_reg:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    mov w10, #-1
-; CHECK-NEXT:    cmp w8, w1, sxth
-; CHECK-NEXT:    csel w8, w1, w0, gt
-; CHECK-NEXT:    csel w9, w0, w1, gt
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    cneg w9, w10, le
-; CHECK-NEXT:    ubfx w8, w8, #1, #15
-; CHECK-NEXT:    madd w0, w8, w9, w0
+; CHECK-NEXT:    sxth w9, w0
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    cmp w9, w1, sxth
+; CHECK-NEXT:    csel w9, w1, w0, gt
+; CHECK-NEXT:    csel w10, w0, w1, gt
+; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w9, w10, w9
+; CHECK-NEXT:    ubfx w9, w9, #1, #15
+; CHECK-NEXT:    madd w0, w9, w8, w0
 ; CHECK-NEXT:    ret
   %t3 = icmp sgt i16 %a1, %a2 ; signed
   %t4 = select i1 %t3, i16 -1, i16 1
@@ -299,15 +299,15 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind {
 define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind {
 ; CHECK-LABEL: scalar_i16_unsigned_reg_reg:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w10, #-1
-; CHECK-NEXT:    cmp w8, w1, uxth
-; CHECK-NEXT:    csel w8, w1, w0, hi
-; CHECK-NEXT:    csel w9, w0, w1, hi
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    cneg w9, w10, ls
-; CHECK-NEXT:    ubfx w8, w8, #1, #15
-; CHECK-NEXT:    madd w0, w8, w9, w0
+; CHECK-NEXT:    and w9, w0, #0xffff
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    cmp w9, w1, uxth
+; CHECK-NEXT:    csel w9, w1, w0, hi
+; CHECK-NEXT:    csel w10, w0, w1, hi
+; CHECK-NEXT:    cneg w8, w8, ls
+; CHECK-NEXT:    sub w9, w10, w9
+; CHECK-NEXT:    ubfx w9, w9, #1, #15
+; CHECK-NEXT:    madd w0, w9, w8, w0
 ; CHECK-NEXT:    ret
   %t3 = icmp ugt i16 %a1, %a2
   %t4 = select i1 %t3, i16 -1, i16 1
@@ -326,12 +326,12 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind {
 ; CHECK-LABEL: scalar_i16_signed_mem_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrsh w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cmp w9, w1, sxth
 ; CHECK-NEXT:    csel w10, w1, w9, gt
 ; CHECK-NEXT:    csel w11, w9, w1, gt
-; CHECK-NEXT:    sub w10, w11, w10
 ; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w10, w11, w10
 ; CHECK-NEXT:    ubfx w10, w10, #1, #15
 ; CHECK-NEXT:    madd w0, w10, w8, w9
 ; CHECK-NEXT:    ret
@@ -350,16 +350,16 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind {
 define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind {
 ; CHECK-LABEL: scalar_i16_signed_reg_mem:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsh w9, [x1]
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    mov w10, #-1
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    csel w8, w9, w0, gt
-; CHECK-NEXT:    csel w9, w0, w9, gt
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    cneg w9, w10, le
-; CHECK-NEXT:    ubfx w8, w8, #1, #15
-; CHECK-NEXT:    madd w0, w8, w9, w0
+; CHECK-NEXT:    sxth w9, w0
+; CHECK-NEXT:    ldrsh w10, [x1]
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    cmp w9, w10
+; CHECK-NEXT:    csel w9, w10, w0, gt
+; CHECK-NEXT:    csel w10, w0, w10, gt
+; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w9, w10, w9
+; CHECK-NEXT:    ubfx w9, w9, #1, #15
+; CHECK-NEXT:    madd w0, w9, w8, w0
 ; CHECK-NEXT:    ret
   %a2 = load i16, ptr %a2_addr
   %t3 = icmp sgt i16 %a1, %a2 ; signed
@@ -377,13 +377,13 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; CHECK-LABEL: scalar_i16_signed_mem_mem:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrsh w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
 ; CHECK-NEXT:    ldrsh w10, [x1]
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cmp w9, w10
 ; CHECK-NEXT:    csel w11, w10, w9, gt
 ; CHECK-NEXT:    csel w10, w9, w10, gt
-; CHECK-NEXT:    sub w10, w10, w11
 ; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w10, w10, w11
 ; CHECK-NEXT:    ubfx w10, w10, #1, #15
 ; CHECK-NEXT:    madd w0, w10, w8, w9
 ; CHECK-NEXT:    ret
@@ -409,15 +409,15 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 define i8 @scalar_i8_signed_reg_reg(i8 %a1, i8 %a2) nounwind {
 ; CHECK-LABEL: scalar_i8_signed_reg_reg:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    mov w10, #-1
-; CHECK-NEXT:    cmp w8, w1, sxtb
-; CHECK-NEXT:    csel w8, w1, w0, gt
-; CHECK-NEXT:    csel w9, w0, w1, gt
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    cneg w9, w10, le
-; CHECK-NEXT:    ubfx w8, w8, #1, #7
-; CHECK-NEXT:    madd w0, w8, w9, w0
+; CHECK-NEXT:    sxtb w9, w0
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    cmp w9, w1, sxtb
+; CHECK-NEXT:    csel w9, w1, w0, gt
+; CHECK-NEXT:    csel w10, w0, w1, gt
+; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w9, w10, w9
+; CHECK-NEXT:    ubfx w9, w9, #1, #7
+; CHECK-NEXT:    madd w0, w9, w8, w0
 ; CHECK-NEXT:    ret
   %t3 = icmp sgt i8 %a1, %a2 ; signed
   %t4 = select i1 %t3, i8 -1, i8 1
@@ -433,15 +433,15 @@ define i8 @scalar_i8_signed_reg_reg(i8 %a1, i8 %a2) nounwind {
 define i8 @scalar_i8_unsigned_reg_reg(i8 %a1, i8 %a2) nounwind {
 ; CHECK-LABEL: scalar_i8_unsigned_reg_reg:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    mov w10, #-1
-; CHECK-NEXT:    cmp w8, w1, uxtb
-; CHECK-NEXT:    csel w8, w1, w0, hi
-; CHECK-NEXT:    csel w9, w0, w1, hi
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    cneg w9, w10, ls
-; CHECK-NEXT:    ubfx w8, w8, #1, #7
-; CHECK-NEXT:    madd w0, w8, w9, w0
+; CHECK-NEXT:    and w9, w0, #0xff
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    cmp w9, w1, uxtb
+; CHECK-NEXT:    csel w9, w1, w0, hi
+; CHECK-NEXT:    csel w10, w0, w1, hi
+; CHECK-NEXT:    cneg w8, w8, ls
+; CHECK-NEXT:    sub w9, w10, w9
+; CHECK-NEXT:    ubfx w9, w9, #1, #7
+; CHECK-NEXT:    madd w0, w9, w8, w0
 ; CHECK-NEXT:    ret
   %t3 = icmp ugt i8 %a1, %a2
   %t4 = select i1 %t3, i8 -1, i8 1
@@ -460,12 +460,12 @@ define i8 @scalar_i8_signed_mem_reg(ptr %a1_addr, i8 %a2) nounwind {
 ; CHECK-LABEL: scalar_i8_signed_mem_reg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrsb w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cmp w9, w1, sxtb
 ; CHECK-NEXT:    csel w10, w1, w9, gt
 ; CHECK-NEXT:    csel w11, w9, w1, gt
-; CHECK-NEXT:    sub w10, w11, w10
 ; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w10, w11, w10
 ; CHECK-NEXT:    ubfx w10, w10, #1, #7
 ; CHECK-NEXT:    madd w0, w10, w8, w9
 ; CHECK-NEXT:    ret
@@ -484,16 +484,16 @@ define i8 @scalar_i8_signed_mem_reg(ptr %a1_addr, i8 %a2) nounwind {
 define i8 @scalar_i8_signed_reg_mem(i8 %a1, ptr %a2_addr) nounwind {
 ; CHECK-LABEL: scalar_i8_signed_reg_mem:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsb w9, [x1]
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    mov w10, #-1
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    csel w8, w9, w0, gt
-; CHECK-NEXT:    csel w9, w0, w9, gt
-; CHECK-NEXT:    sub w8, w9, w8
-; CHECK-NEXT:    cneg w9, w10, le
-; CHECK-NEXT:    ubfx w8, w8, #1, #7
-; CHECK-NEXT:    madd w0, w8, w9, w0
+; CHECK-NEXT:    sxtb w9, w0
+; CHECK-NEXT:    ldrsb w10, [x1]
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    cmp w9, w10
+; CHECK-NEXT:    csel w9, w10, w0, gt
+; CHECK-NEXT:    csel w10, w0, w10, gt
+; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w9, w10, w9
+; CHECK-NEXT:    ubfx w9, w9, #1, #7
+; CHECK-NEXT:    madd w0, w9, w8, w0
 ; CHECK-NEXT:    ret
   %a2 = load i8, ptr %a2_addr
   %t3 = icmp sgt i8 %a1, %a2 ; signed
@@ -511,13 +511,13 @@ define i8 @scalar_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; CHECK-LABEL: scalar_i8_signed_mem_mem:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrsb w9, [x0]
-; CHECK-NEXT:    mov w8, #-1
 ; CHECK-NEXT:    ldrsb w10, [x1]
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    cmp w9, w10
 ; CHECK-NEXT:    csel w11, w10, w9, gt
 ; CHECK-NEXT:    csel w10, w9, w10, gt
-; CHECK-NEXT:    sub w10, w10, w11
 ; CHECK-NEXT:    cneg w8, w8, le
+; CHECK-NEXT:    sub w10, w10, w11
 ; CHECK-NEXT:    ubfx w10, w10, #1, #7
 ; CHECK-NEXT:    madd w0, w10, w8, w9
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll b/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
index 3dad36acdf6365..9257832d4c4bd8 100644
--- a/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
@@ -10,9 +10,9 @@
 define <4 x i32> @smin_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_bc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -26,9 +26,9 @@ define <4 x i32> @smin_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @smin_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_cb:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smin v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -74,9 +74,9 @@ define <4 x i32> @smin_bc_ba(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @smin_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_bc_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -90,9 +90,9 @@ define <4 x i32> @smin_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 define <4 x i32> @smin_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_cb_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smin v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -138,9 +138,9 @@ define <4 x i32> @smin_bc_ba_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 define <4 x i32> @smin_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_bc_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -154,9 +154,9 @@ define <4 x i32> @smin_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @smin_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_cb_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smin v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -202,9 +202,9 @@ define <4 x i32> @smin_bc_ba_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @smin_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_bc_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -218,9 +218,9 @@ define <4 x i32> @smin_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @smin_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_cb_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smin v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -266,9 +266,9 @@ define <4 x i32> @smin_bc_ba_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @smax_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_bc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -282,9 +282,9 @@ define <4 x i32> @smax_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @smax_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_cb:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smax v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -330,9 +330,9 @@ define <4 x i32> @smax_bc_ba(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @smax_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_bc_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -346,9 +346,9 @@ define <4 x i32> @smax_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 define <4 x i32> @smax_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_cb_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smax v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -394,9 +394,9 @@ define <4 x i32> @smax_bc_ba_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 define <4 x i32> @smax_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_bc_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -410,9 +410,9 @@ define <4 x i32> @smax_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @smax_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_cb_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smax v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -458,9 +458,9 @@ define <4 x i32> @smax_bc_ba_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @smax_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_bc_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -474,9 +474,9 @@ define <4 x i32> @smax_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @smax_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_cb_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    smax v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -522,9 +522,9 @@ define <4 x i32> @smax_bc_ba_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @umin_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_bc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -538,9 +538,9 @@ define <4 x i32> @umin_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @umin_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_cb:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umin v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -586,9 +586,9 @@ define <4 x i32> @umin_bc_ba(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @umin_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_bc_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -602,9 +602,9 @@ define <4 x i32> @umin_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 define <4 x i32> @umin_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_cb_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umin v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -650,9 +650,9 @@ define <4 x i32> @umin_bc_ba_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 define <4 x i32> @umin_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_bc_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -666,9 +666,9 @@ define <4 x i32> @umin_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @umin_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_cb_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umin v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -714,9 +714,9 @@ define <4 x i32> @umin_bc_ba_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @umin_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_bc_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -730,9 +730,9 @@ define <4 x i32> @umin_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @umin_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_cb_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umin v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umin v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -778,9 +778,9 @@ define <4 x i32> @umin_bc_ba_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @umax_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_bc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -794,9 +794,9 @@ define <4 x i32> @umax_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @umax_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_cb:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umax v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -842,9 +842,9 @@ define <4 x i32> @umax_bc_ba(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @umax_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_bc_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -858,9 +858,9 @@ define <4 x i32> @umax_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 define <4 x i32> @umax_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_cb_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umax v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -906,9 +906,9 @@ define <4 x i32> @umax_bc_ba_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
 define <4 x i32> @umax_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_bc_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -922,9 +922,9 @@ define <4 x i32> @umax_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @umax_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_cb_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umax v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -970,9 +970,9 @@ define <4 x i32> @umax_bc_ba_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <4 x i32> @umax_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_bc_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umax v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -986,9 +986,9 @@ define <4 x i32> @umax_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @umax_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_cb_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umax v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    umax v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -1034,8 +1034,8 @@ define <4 x i32> @umax_bc_ba_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @notted_smin_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_bc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
@@ -1056,8 +1056,8 @@ define <4 x i32> @notted_smin_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_smin_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_cb:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smin v1.4s, v2.4s, v1.4s
@@ -1078,8 +1078,8 @@ define <4 x i32> @notted_smin_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_smin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ab:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
@@ -1100,8 +1100,8 @@ define <4 x i32> @notted_smin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_smin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ba:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v1.4s, v0.4s
@@ -1122,8 +1122,8 @@ define <4 x i32> @notted_smin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_smin_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_bc_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
@@ -1144,8 +1144,8 @@ define <4 x i32> @notted_smin_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_smin_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_cb_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smin v1.4s, v2.4s, v1.4s
@@ -1166,8 +1166,8 @@ define <4 x i32> @notted_smin_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_smin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ab_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
@@ -1188,8 +1188,8 @@ define <4 x i32> @notted_smin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_smin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ba_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v1.4s, v0.4s
@@ -1210,8 +1210,8 @@ define <4 x i32> @notted_smin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_smin_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_bc_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
@@ -1232,8 +1232,8 @@ define <4 x i32> @notted_smin_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_smin_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_cb_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smin v1.4s, v2.4s, v1.4s
@@ -1254,8 +1254,8 @@ define <4 x i32> @notted_smin_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_smin_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ab_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
@@ -1276,8 +1276,8 @@ define <4 x i32> @notted_smin_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_smin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ba_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v1.4s, v0.4s
@@ -1298,8 +1298,8 @@ define <4 x i32> @notted_smin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_smin_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_bc_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smin v1.4s, v1.4s, v2.4s
@@ -1320,8 +1320,8 @@ define <4 x i32> @notted_smin_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_smin_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_cb_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smin v1.4s, v2.4s, v1.4s
@@ -1342,8 +1342,8 @@ define <4 x i32> @notted_smin_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_smin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ab_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
@@ -1364,8 +1364,8 @@ define <4 x i32> @notted_smin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_smin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ba_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smin v0.4s, v1.4s, v0.4s
@@ -1386,8 +1386,8 @@ define <4 x i32> @notted_smin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_smax_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_bc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
@@ -1408,8 +1408,8 @@ define <4 x i32> @notted_smax_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_smax_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_cb:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smax v1.4s, v2.4s, v1.4s
@@ -1430,8 +1430,8 @@ define <4 x i32> @notted_smax_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_smax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ab:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
@@ -1452,8 +1452,8 @@ define <4 x i32> @notted_smax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_smax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ba:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v1.4s, v0.4s
@@ -1474,8 +1474,8 @@ define <4 x i32> @notted_smax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_smax_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_bc_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
@@ -1496,8 +1496,8 @@ define <4 x i32> @notted_smax_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_smax_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_cb_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smax v1.4s, v2.4s, v1.4s
@@ -1518,8 +1518,8 @@ define <4 x i32> @notted_smax_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_smax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ab_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
@@ -1540,8 +1540,8 @@ define <4 x i32> @notted_smax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_smax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ba_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v1.4s, v0.4s
@@ -1562,8 +1562,8 @@ define <4 x i32> @notted_smax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_smax_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_bc_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
@@ -1584,8 +1584,8 @@ define <4 x i32> @notted_smax_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_smax_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_cb_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smax v1.4s, v2.4s, v1.4s
@@ -1606,8 +1606,8 @@ define <4 x i32> @notted_smax_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_smax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ab_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
@@ -1628,8 +1628,8 @@ define <4 x i32> @notted_smax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_smax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ba_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v1.4s, v0.4s
@@ -1650,8 +1650,8 @@ define <4 x i32> @notted_smax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_smax_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_bc_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smax v1.4s, v1.4s, v2.4s
@@ -1672,8 +1672,8 @@ define <4 x i32> @notted_smax_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_smax_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_cb_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    smax v1.4s, v2.4s, v1.4s
@@ -1694,8 +1694,8 @@ define <4 x i32> @notted_smax_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_smax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ab_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
@@ -1716,8 +1716,8 @@ define <4 x i32> @notted_smax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_smax_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ba_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    smax v0.4s, v1.4s, v0.4s
@@ -1738,8 +1738,8 @@ define <4 x i32> @notted_smax_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_umin_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_bc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
@@ -1760,8 +1760,8 @@ define <4 x i32> @notted_umin_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_umin_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_cb:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umin v1.4s, v2.4s, v1.4s
@@ -1782,8 +1782,8 @@ define <4 x i32> @notted_umin_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_umin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ab:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
@@ -1804,8 +1804,8 @@ define <4 x i32> @notted_umin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_umin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ba:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v1.4s, v0.4s
@@ -1826,8 +1826,8 @@ define <4 x i32> @notted_umin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_umin_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_bc_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
@@ -1848,8 +1848,8 @@ define <4 x i32> @notted_umin_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_umin_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_cb_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umin v1.4s, v2.4s, v1.4s
@@ -1870,8 +1870,8 @@ define <4 x i32> @notted_umin_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_umin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ab_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
@@ -1892,8 +1892,8 @@ define <4 x i32> @notted_umin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_umin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ba_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v1.4s, v0.4s
@@ -1914,8 +1914,8 @@ define <4 x i32> @notted_umin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_umin_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_bc_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
@@ -1936,8 +1936,8 @@ define <4 x i32> @notted_umin_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_umin_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_cb_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umin v1.4s, v2.4s, v1.4s
@@ -1958,8 +1958,8 @@ define <4 x i32> @notted_umin_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_umin_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ab_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
@@ -1980,8 +1980,8 @@ define <4 x i32> @notted_umin_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_umin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ba_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v1.4s, v0.4s
@@ -2002,8 +2002,8 @@ define <4 x i32> @notted_umin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_umin_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_bc_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umin v1.4s, v1.4s, v2.4s
@@ -2024,8 +2024,8 @@ define <4 x i32> @notted_umin_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_umin_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_cb_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umin v1.4s, v2.4s, v1.4s
@@ -2046,8 +2046,8 @@ define <4 x i32> @notted_umin_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_umin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ab_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
@@ -2068,8 +2068,8 @@ define <4 x i32> @notted_umin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_umin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ba_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umin v0.4s, v1.4s, v0.4s
@@ -2090,8 +2090,8 @@ define <4 x i32> @notted_umin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_umax_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_bc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umax v1.4s, v1.4s, v2.4s
@@ -2112,8 +2112,8 @@ define <4 x i32> @notted_umax_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_umax_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_cb:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umax v1.4s, v2.4s, v1.4s
@@ -2134,8 +2134,8 @@ define <4 x i32> @notted_umax_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_umax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ab:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
@@ -2156,8 +2156,8 @@ define <4 x i32> @notted_umax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_umax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ba:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v1.4s, v0.4s
@@ -2178,8 +2178,8 @@ define <4 x i32> @notted_umax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @notted_umax_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_bc_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umax v1.4s, v1.4s, v2.4s
@@ -2200,8 +2200,8 @@ define <4 x i32> @notted_umax_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_umax_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_cb_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umax v1.4s, v2.4s, v1.4s
@@ -2222,8 +2222,8 @@ define <4 x i32> @notted_umax_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_umax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ab_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
@@ -2244,8 +2244,8 @@ define <4 x i32> @notted_umax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_umax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ba_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v1.4s, v0.4s
@@ -2266,8 +2266,8 @@ define <4 x i32> @notted_umax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i
 define <4 x i32> @notted_umax_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_bc_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umax v1.4s, v1.4s, v2.4s
@@ -2288,8 +2288,8 @@ define <4 x i32> @notted_umax_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_umax_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_cb_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umax v1.4s, v2.4s, v1.4s
@@ -2310,8 +2310,8 @@ define <4 x i32> @notted_umax_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_umax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ab_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
@@ -2332,8 +2332,8 @@ define <4 x i32> @notted_umax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_umax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ba_eq_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v1.4s, v0.4s
@@ -2354,8 +2354,8 @@ define <4 x i32> @notted_umax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32
 define <4 x i32> @notted_umax_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_bc_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umax v1.4s, v1.4s, v2.4s
@@ -2376,8 +2376,8 @@ define <4 x i32> @notted_umax_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_umax_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_cb_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    umax v1.4s, v2.4s, v1.4s
@@ -2398,8 +2398,8 @@ define <4 x i32> @notted_umax_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_umax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ab_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
@@ -2420,8 +2420,8 @@ define <4 x i32> @notted_umax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4
 define <4 x i32> @notted_umax_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ba_eq_swap_pred:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    mvn v2.16b, v2.16b
 ; CHECK-NEXT:    umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    umax v0.4s, v1.4s, v0.4s

diff --git a/llvm/test/CodeGen/AArch64/minmax.ll b/llvm/test/CodeGen/AArch64/minmax.ll
index be9f45e6eea443..db9bd236319c67 100644
--- a/llvm/test/CodeGen/AArch64/minmax.ll
+++ b/llvm/test/CodeGen/AArch64/minmax.ll
@@ -97,8 +97,8 @@ define <4 x i32> @t9(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <8 x i32> @t10(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: t10:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    smax v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
   %t1 = icmp sgt <8 x i32> %a, %b
   %t2 = select <8 x i1> %t1, <8 x i32> %a, <8 x i32> %b
@@ -158,10 +158,10 @@ define <2 x i64> @t14(<2 x i64> %a, <2 x i64> %b) {
 define <4 x i64> @t15(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK-LABEL: t15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmhi v4.2d, v2.2d, v0.2d
-; CHECK-NEXT:    cmhi v5.2d, v3.2d, v1.2d
-; CHECK-NEXT:    bif v0.16b, v2.16b, v4.16b
-; CHECK-NEXT:    bif v1.16b, v3.16b, v5.16b
+; CHECK-NEXT:    cmhi v4.2d, v3.2d, v1.2d
+; CHECK-NEXT:    cmhi v5.2d, v2.2d, v0.2d
+; CHECK-NEXT:    bif v1.16b, v3.16b, v4.16b
+; CHECK-NEXT:    bif v0.16b, v2.16b, v5.16b
 ; CHECK-NEXT:    ret
   %t1 = icmp ule <4 x i64> %a, %b
   %t2 = select <4 x i1> %t1, <4 x i64> %a, <4 x i64> %b

diff --git a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir
index 23d8ad46b9b1cc..6cdbbb8c53d690 100644
--- a/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir
+++ b/llvm/test/CodeGen/AArch64/misched-detail-resource-booking-01.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon  %s -o - 2>&1 \
+# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon  -mcpu=cortex-a55 %s -o - 2>&1 \
 # RUN:   -misched-dump-reserved-cycles=true \
 # RUN:   -run-pass=machine-scheduler -debug-only=machine-scheduler \
 # RUN:   -misched-bottomup=true -sched-print-cycles=true \
@@ -13,7 +13,7 @@
   source_filename = "../llvm-project/llvm/test/CodeGen/AArch64/aarch64-smull.failing.ll"
   target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
   target triple = "aarch64-none-linux-gnu"
-  
+
   define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) #0 {
   entry:
     %in1 = zext <8 x i16> %src1 to <8 x i32>
@@ -21,7 +21,7 @@
     %out = mul nsw <8 x i32> %in1, %in2
     ret <8 x i32> %out
   }
-  
+
   attributes #0 = { "target-features"="+neon" }
 
 ...
@@ -54,7 +54,7 @@ machineFunctionInfo: {}
 body:             |
   bb.0.entry:
     liveins: $q0, $q1, $q2
-  
+
     %2:fpr128 = COPY $q2
     %1:fpr128 = COPY $q1
     %0:fpr128 = COPY $q0
@@ -75,7 +75,7 @@ body:             |
 # CHECK-LABEL: Before MISched:
 # CHECK-NEXT: # Machine code for function umull_and_v8i32: IsSSA, NoPHIs, TracksLiveness
 # CHECK-NEXT: Function Live Ins: $q0 in %0, $q1 in %1, $q2 in %2
-# CHECK-EMPTY: 
+# CHECK-EMPTY:
 # CHECK-NEXT: bb.0.entry:
 # CHECK-NEXT:   liveins: $q0, $q1, $q2
 # CHECK-NEXT:   %2:fpr128 = COPY $q2
@@ -92,9 +92,9 @@ body:             |
 # CHECK-NEXT:   $q0 = COPY %10:fpr128
 # CHECK-NEXT:   $q1 = COPY %12:fpr128
 # CHECK-NEXT:   RET_ReallyLR implicit $q0, implicit $q1
-# CHECK-EMPTY: 
+# CHECK-EMPTY:
 # CHECK-NEXT: # End machine code for function umull_and_v8i32.
-# CHECK-EMPTY: 
+# CHECK-EMPTY:
 # CHECK-NEXT: ********** MI Scheduling **********
 # CHECK-NEXT: umull_and_v8i32:%bb.0 entry
 # CHECK-NEXT:   From: %2:fpr128 = COPY $q2
@@ -272,7 +272,7 @@ body:             |
 # CHECK-NEXT:   Predecessors:
 # CHECK-NEXT:     SU(12): Ord  Latency=3 Artificial
 # CHECK-NEXT:     SU(11): Ord  Latency=3 Artificial
-# CHECK-NEXT:   Resource booking (@0c): 
+# CHECK-NEXT:   Resource booking (@0c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -285,11 +285,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@0c): 
+# CHECK-NEXT:   getNextResourceCycle (@0c):
 # CHECK-NEXT:     Instance 0 available @0c
 # CHECK-NEXT:     Instance 1 available @0c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @0c
-# CHECK-NEXT:   Resource booking (@0c): 
+# CHECK-NEXT:   Resource booking (@0c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -302,11 +302,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@0c): 
+# CHECK-NEXT:   getNextResourceCycle (@0c):
 # CHECK-NEXT:     Instance 0 available @0c
 # CHECK-NEXT:     Instance 1 available @0c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @0c
-# CHECK-NEXT:   Resource booking (@0c): 
+# CHECK-NEXT:   Resource booking (@0c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -319,11 +319,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@0c): 
+# CHECK-NEXT:   getNextResourceCycle (@0c):
 # CHECK-NEXT:     Instance 0 available @0c
 # CHECK-NEXT:     Instance 1 available @0c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @0c
-# CHECK-NEXT:   Resource booking (@0c): 
+# CHECK-NEXT:   Resource booking (@0c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -336,14 +336,14 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@0c): 
+# CHECK-NEXT:   getNextResourceCycle (@0c):
 # CHECK-NEXT:     Instance 0 available @0c
 # CHECK-NEXT:     Instance 1 available @0c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @0c
 # CHECK-NEXT: Critical Path(GS-RR ): 14
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
 # CHECK-NEXT: Cycle: 3 BotQ.A
-# CHECK-NEXT:   Resource booking (@3c): 
+# CHECK-NEXT:   Resource booking (@3c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -356,11 +356,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@3c): 
+# CHECK-NEXT:   getNextResourceCycle (@3c):
 # CHECK-NEXT:     Instance 0 available @3c
 # CHECK-NEXT:     Instance 1 available @3c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @3c
-# CHECK-NEXT:   Resource booking (@3c): 
+# CHECK-NEXT:   Resource booking (@3c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -373,18 +373,18 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@3c): 
+# CHECK-NEXT:   getNextResourceCycle (@3c):
 # CHECK-NEXT:     Instance 0 available @3c
 # CHECK-NEXT:     Instance 1 available @3c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @3c
-# CHECK-NEXT: Queue BotQ.P: 
-# CHECK-NEXT: Queue BotQ.A: 12 11 
-# CHECK-NEXT:   Cand SU(12) ORDER                              
-# CHECK-NEXT: Pick Bot ORDER     
+# CHECK-NEXT: Queue BotQ.P:
+# CHECK-NEXT: Queue BotQ.A: 12 11
+# CHECK-NEXT:   Cand SU(12) ORDER
+# CHECK-NEXT: Pick Bot ORDER
 # CHECK-NEXT: Scheduling SU(12) $q1 = COPY %12:fpr128
 # CHECK-NEXT:   Ready @3c
 # CHECK-NEXT:   CortexA55UnitALU +1x1u
-# CHECK-NEXT:   Resource booking (@3c): 
+# CHECK-NEXT:   Resource booking (@3c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -397,11 +397,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@3c): 
+# CHECK-NEXT:   getNextResourceCycle (@3c):
 # CHECK-NEXT:     Instance 0 available @3c
 # CHECK-NEXT:     Instance 1 available @3c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @3c
-# CHECK-NEXT:   Resource booking (@3c): 
+# CHECK-NEXT:   Resource booking (@3c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -414,7 +414,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@3c): 
+# CHECK-NEXT:   getNextResourceCycle (@3c):
 # CHECK-NEXT:     Instance 0 available @3c
 # CHECK-NEXT:     Instance 1 available @3c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @3c
@@ -439,7 +439,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
-# CHECK-NEXT:   Resource booking (@3c): 
+# CHECK-NEXT:   Resource booking (@3c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -452,16 +452,16 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@3c): 
+# CHECK-NEXT:   getNextResourceCycle (@3c):
 # CHECK-NEXT:     Instance 0 available @4c
 # CHECK-NEXT:     Instance 1 available @3c
 # CHECK-NEXT:     selecting CortexA55UnitALU[1] available @3c
-# CHECK-NEXT: Queue BotQ.P: 10 
-# CHECK-NEXT: Queue BotQ.A: 11 
+# CHECK-NEXT: Queue BotQ.P: 10
+# CHECK-NEXT: Queue BotQ.A: 11
 # CHECK-NEXT: Scheduling SU(11) $q0 = COPY %10:fpr128
 # CHECK-NEXT:   Ready @3c
 # CHECK-NEXT:   CortexA55UnitALU +1x1u
-# CHECK-NEXT:   Resource booking (@3c): 
+# CHECK-NEXT:   Resource booking (@3c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -474,11 +474,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@3c): 
+# CHECK-NEXT:   getNextResourceCycle (@3c):
 # CHECK-NEXT:     Instance 0 available @4c
 # CHECK-NEXT:     Instance 1 available @3c
 # CHECK-NEXT:     selecting CortexA55UnitALU[1] available @3c
-# CHECK-NEXT:   Resource booking (@3c): 
+# CHECK-NEXT:   Resource booking (@3c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 4294967295
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -491,7 +491,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@3c): 
+# CHECK-NEXT:   getNextResourceCycle (@3c):
 # CHECK-NEXT:     Instance 0 available @4c
 # CHECK-NEXT:     Instance 1 available @3c
 # CHECK-NEXT:     selecting CortexA55UnitALU[1] available @3c
@@ -517,7 +517,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
 # CHECK-NEXT: Cycle: 7 BotQ.A
-# CHECK-NEXT:   Resource booking (@7c): 
+# CHECK-NEXT:   Resource booking (@7c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -530,11 +530,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@7c): 
+# CHECK-NEXT:   getNextResourceCycle (@7c):
 # CHECK-NEXT:     Instance 0 available @7c
 # CHECK-NEXT:     Instance 1 available @7c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @7c
-# CHECK-NEXT:   Resource booking (@7c): 
+# CHECK-NEXT:   Resource booking (@7c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -547,18 +547,18 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@7c): 
+# CHECK-NEXT:   getNextResourceCycle (@7c):
 # CHECK-NEXT:     Instance 0 available @7c
 # CHECK-NEXT:     Instance 1 available @7c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @7c
-# CHECK-NEXT: Queue BotQ.P: 
-# CHECK-NEXT: Queue BotQ.A: 10 8 
-# CHECK-NEXT:   Cand SU(10) ORDER                              
-# CHECK-NEXT: Pick Bot ORDER     
+# CHECK-NEXT: Queue BotQ.P:
+# CHECK-NEXT: Queue BotQ.A: 10 8
+# CHECK-NEXT:   Cand SU(10) ORDER
+# CHECK-NEXT: Pick Bot ORDER
 # CHECK-NEXT: Scheduling SU(10) %12:fpr128 = UMULLv4i16_v4i32 %3.dsub:fpr128, %11:fpr64
 # CHECK-NEXT:   Ready @7c
 # CHECK-NEXT:   CortexA55UnitFPALU +2x1u
-# CHECK-NEXT:   Resource booking (@7c): 
+# CHECK-NEXT:   Resource booking (@7c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -571,11 +571,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@7c): 
+# CHECK-NEXT:   getNextResourceCycle (@7c):
 # CHECK-NEXT:     Instance 0 available @7c
 # CHECK-NEXT:     Instance 1 available @7c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @7c
-# CHECK-NEXT:   Resource booking (@7c): 
+# CHECK-NEXT:   Resource booking (@7c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -588,7 +588,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@7c): 
+# CHECK-NEXT:   getNextResourceCycle (@7c):
 # CHECK-NEXT:     Instance 0 available @7c
 # CHECK-NEXT:     Instance 1 available @7c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @7c
@@ -614,7 +614,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
-# CHECK-NEXT:   Resource booking (@8c): 
+# CHECK-NEXT:   Resource booking (@8c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -627,16 +627,16 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@8c): 
+# CHECK-NEXT:   getNextResourceCycle (@8c):
 # CHECK-NEXT:     Instance 0 available @9c
 # CHECK-NEXT:     Instance 1 available @8c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @8c
-# CHECK-NEXT: Queue BotQ.P: 9 3 
-# CHECK-NEXT: Queue BotQ.A: 8 
+# CHECK-NEXT: Queue BotQ.P: 9 3
+# CHECK-NEXT: Queue BotQ.A: 8
 # CHECK-NEXT: Scheduling SU(8) %10:fpr128 = UMULLv4i16_v4i32 %0.dsub:fpr128, %9:fpr64
 # CHECK-NEXT:   Ready @8c
 # CHECK-NEXT:   CortexA55UnitFPALU +2x1u
-# CHECK-NEXT:   Resource booking (@8c): 
+# CHECK-NEXT:   Resource booking (@8c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -649,11 +649,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@8c): 
+# CHECK-NEXT:   getNextResourceCycle (@8c):
 # CHECK-NEXT:     Instance 0 available @9c
 # CHECK-NEXT:     Instance 1 available @8c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @8c
-# CHECK-NEXT:   Resource booking (@8c): 
+# CHECK-NEXT:   Resource booking (@8c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -666,7 +666,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@8c): 
+# CHECK-NEXT:   getNextResourceCycle (@8c):
 # CHECK-NEXT:     Instance 0 available @9c
 # CHECK-NEXT:     Instance 1 available @8c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @8c
@@ -692,7 +692,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
-# CHECK-NEXT:   Resource booking (@9c): 
+# CHECK-NEXT:   Resource booking (@9c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -705,11 +705,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@9c): 
+# CHECK-NEXT:   getNextResourceCycle (@9c):
 # CHECK-NEXT:     Instance 0 available @9c
 # CHECK-NEXT:     Instance 1 available @9c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @9c
-# CHECK-NEXT:   Resource booking (@9c): 
+# CHECK-NEXT:   Resource booking (@9c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -722,16 +722,16 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@9c): 
+# CHECK-NEXT:   getNextResourceCycle (@9c):
 # CHECK-NEXT:     Instance 0 available @9c
 # CHECK-NEXT:     Instance 1 available @9c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @9c
-# CHECK-NEXT: Queue BotQ.P: 7 3 
-# CHECK-NEXT: Queue BotQ.A: 9 
+# CHECK-NEXT: Queue BotQ.P: 7 3
+# CHECK-NEXT: Queue BotQ.A: 9
 # CHECK-NEXT: Scheduling SU(9) %11:fpr64 = XTNv4i16 %7:fpr128
 # CHECK-NEXT:   Ready @9c
 # CHECK-NEXT:   CortexA55UnitFPALU +1x1u
-# CHECK-NEXT:   Resource booking (@9c): 
+# CHECK-NEXT:   Resource booking (@9c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -744,11 +744,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@9c): 
+# CHECK-NEXT:   getNextResourceCycle (@9c):
 # CHECK-NEXT:     Instance 0 available @9c
 # CHECK-NEXT:     Instance 1 available @9c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @9c
-# CHECK-NEXT:   Resource booking (@9c): 
+# CHECK-NEXT:   Resource booking (@9c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -761,7 +761,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@9c): 
+# CHECK-NEXT:   getNextResourceCycle (@9c):
 # CHECK-NEXT:     Instance 0 available @9c
 # CHECK-NEXT:     Instance 1 available @9c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @9c
@@ -786,7 +786,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
 # CHECK-NEXT: Cycle: 10 BotQ.A
-# CHECK-NEXT:   Resource booking (@10c): 
+# CHECK-NEXT:   Resource booking (@10c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -799,11 +799,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@10c): 
+# CHECK-NEXT:   getNextResourceCycle (@10c):
 # CHECK-NEXT:     Instance 0 available @10c
 # CHECK-NEXT:     Instance 1 available @10c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @10c
-# CHECK-NEXT:   Resource booking (@10c): 
+# CHECK-NEXT:   Resource booking (@10c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -816,18 +816,18 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@10c): 
+# CHECK-NEXT:   getNextResourceCycle (@10c):
 # CHECK-NEXT:     Instance 0 available @11c
 # CHECK-NEXT:     Instance 1 available @10c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @10c
-# CHECK-NEXT: Queue BotQ.P: 3 
-# CHECK-NEXT: Queue BotQ.A: 7 5 
-# CHECK-NEXT:   Cand SU(7) ORDER                              
-# CHECK-NEXT: Pick Bot ORDER     
+# CHECK-NEXT: Queue BotQ.P: 3
+# CHECK-NEXT: Queue BotQ.A: 7 5
+# CHECK-NEXT:   Cand SU(7) ORDER
+# CHECK-NEXT: Pick Bot ORDER
 # CHECK-NEXT: Scheduling SU(7) %9:fpr64 = XTNv4i16 %8:fpr128
 # CHECK-NEXT:   Ready @10c
 # CHECK-NEXT:   CortexA55UnitFPALU +1x1u
-# CHECK-NEXT:   Resource booking (@10c): 
+# CHECK-NEXT:   Resource booking (@10c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -840,11 +840,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@10c): 
+# CHECK-NEXT:   getNextResourceCycle (@10c):
 # CHECK-NEXT:     Instance 0 available @10c
 # CHECK-NEXT:     Instance 1 available @10c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @10c
-# CHECK-NEXT:   Resource booking (@10c): 
+# CHECK-NEXT:   Resource booking (@10c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -857,7 +857,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@10c): 
+# CHECK-NEXT:   getNextResourceCycle (@10c):
 # CHECK-NEXT:     Instance 0 available @10c
 # CHECK-NEXT:     Instance 1 available @10c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @10c
@@ -880,7 +880,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
-# CHECK-NEXT:   Resource booking (@10c): 
+# CHECK-NEXT:   Resource booking (@10c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -893,17 +893,17 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@10c): 
+# CHECK-NEXT:   getNextResourceCycle (@10c):
 # CHECK-NEXT:     Instance 0 available @12c
 # CHECK-NEXT:     Instance 1 available @10c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @10c
-# CHECK-NEXT: Queue BotQ.P: 3 6 
-# CHECK-NEXT: Queue BotQ.A: 5 
+# CHECK-NEXT: Queue BotQ.P: 3 6
+# CHECK-NEXT: Queue BotQ.A: 5
 # CHECK-NEXT: Scheduling SU(5) %7:fpr128 = ANDv16i8 %2:fpr128, %6:fpr128
 # CHECK-NEXT:   Ready @10c
 # CHECK-NEXT:   CortexA55UnitFPALU +2x1u
 # CHECK-NEXT:   *** Critical resource CortexA55UnitFPALU: 4c
-# CHECK-NEXT:   Resource booking (@10c): 
+# CHECK-NEXT:   Resource booking (@10c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -916,11 +916,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@10c): 
+# CHECK-NEXT:   getNextResourceCycle (@10c):
 # CHECK-NEXT:     Instance 0 available @12c
 # CHECK-NEXT:     Instance 1 available @10c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @10c
-# CHECK-NEXT:   Resource booking (@10c): 
+# CHECK-NEXT:   Resource booking (@10c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -933,7 +933,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@10c): 
+# CHECK-NEXT:   getNextResourceCycle (@10c):
 # CHECK-NEXT:     Instance 0 available @12c
 # CHECK-NEXT:     Instance 1 available @10c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @10c
@@ -959,7 +959,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
-# CHECK-NEXT:   Resource booking (@11c): 
+# CHECK-NEXT:   Resource booking (@11c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -972,12 +972,12 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@11c): 
+# CHECK-NEXT:   getNextResourceCycle (@11c):
 # CHECK-NEXT:     Instance 0 available @12c
 # CHECK-NEXT:     Instance 1 available @12c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @12c
 # CHECK-NEXT:   SU(3) CortexA55UnitFPALU[0]=12c
-# CHECK-NEXT:   Resource booking (@11c): 
+# CHECK-NEXT:   Resource booking (@11c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -990,13 +990,13 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@11c): 
+# CHECK-NEXT:   getNextResourceCycle (@11c):
 # CHECK-NEXT:     Instance 0 available @12c
 # CHECK-NEXT:     Instance 1 available @12c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @12c
 # CHECK-NEXT:   SU(6) CortexA55UnitFPALU[0]=12c
 # CHECK-NEXT: Cycle: 12 BotQ.A
-# CHECK-NEXT:   Resource booking (@12c): 
+# CHECK-NEXT:   Resource booking (@12c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1009,11 +1009,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@12c): 
+# CHECK-NEXT:   getNextResourceCycle (@12c):
 # CHECK-NEXT:     Instance 0 available @12c
 # CHECK-NEXT:     Instance 1 available @12c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @12c
-# CHECK-NEXT:   Resource booking (@12c): 
+# CHECK-NEXT:   Resource booking (@12c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1026,19 +1026,19 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@12c): 
+# CHECK-NEXT:   getNextResourceCycle (@12c):
 # CHECK-NEXT:     Instance 0 available @12c
 # CHECK-NEXT:     Instance 1 available @12c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @12c
-# CHECK-NEXT: Queue BotQ.P: 0 
-# CHECK-NEXT: Queue BotQ.A: 3 6 
-# CHECK-NEXT:   Cand SU(3) ORDER                              
-# CHECK-NEXT:   Cand SU(6) ORDER                              
-# CHECK-NEXT: Pick Bot ORDER     
+# CHECK-NEXT: Queue BotQ.P: 0
+# CHECK-NEXT: Queue BotQ.A: 3 6
+# CHECK-NEXT:   Cand SU(3) ORDER
+# CHECK-NEXT:   Cand SU(6) ORDER
+# CHECK-NEXT: Pick Bot ORDER
 # CHECK-NEXT: Scheduling SU(6) %8:fpr128 = ANDv16i8 %1:fpr128, %6:fpr128
 # CHECK-NEXT:   Ready @12c
 # CHECK-NEXT:   CortexA55UnitFPALU +2x1u
-# CHECK-NEXT:   Resource booking (@12c): 
+# CHECK-NEXT:   Resource booking (@12c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1051,11 +1051,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@12c): 
+# CHECK-NEXT:   getNextResourceCycle (@12c):
 # CHECK-NEXT:     Instance 0 available @12c
 # CHECK-NEXT:     Instance 1 available @12c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @12c
-# CHECK-NEXT:   Resource booking (@12c): 
+# CHECK-NEXT:   Resource booking (@12c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1068,7 +1068,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@12c): 
+# CHECK-NEXT:   getNextResourceCycle (@12c):
 # CHECK-NEXT:     Instance 0 available @12c
 # CHECK-NEXT:     Instance 1 available @12c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @12c
@@ -1094,7 +1094,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
-# CHECK-NEXT:   Resource booking (@13c): 
+# CHECK-NEXT:   Resource booking (@13c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1107,11 +1107,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@13c): 
+# CHECK-NEXT:   getNextResourceCycle (@13c):
 # CHECK-NEXT:     Instance 0 available @13c
 # CHECK-NEXT:     Instance 1 available @13c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @13c
-# CHECK-NEXT:   Resource booking (@13c): 
+# CHECK-NEXT:   Resource booking (@13c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1124,11 +1124,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@13c): 
+# CHECK-NEXT:   getNextResourceCycle (@13c):
 # CHECK-NEXT:     Instance 0 available @14c
 # CHECK-NEXT:     Instance 1 available @13c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @13c
-# CHECK-NEXT:   Resource booking (@13c): 
+# CHECK-NEXT:   Resource booking (@13c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1141,18 +1141,18 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@13c): 
+# CHECK-NEXT:   getNextResourceCycle (@13c):
 # CHECK-NEXT:     Instance 0 available @13c
 # CHECK-NEXT:     Instance 1 available @13c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @13c
-# CHECK-NEXT: Queue BotQ.P: 1 4 
-# CHECK-NEXT: Queue BotQ.A: 3 0 
-# CHECK-NEXT:   Cand SU(3) ORDER                              
-# CHECK-NEXT: Pick Bot PHYS-REG  
+# CHECK-NEXT: Queue BotQ.P: 1 4
+# CHECK-NEXT: Queue BotQ.A: 3 0
+# CHECK-NEXT:   Cand SU(3) ORDER
+# CHECK-NEXT: Pick Bot PHYS-REG
 # CHECK-NEXT: Scheduling SU(3) %3:fpr128 = EXTv16i8 %0:fpr128, %0:fpr128, 8
 # CHECK-NEXT:   Ready @13c
 # CHECK-NEXT:   CortexA55UnitFPALU +2x1u
-# CHECK-NEXT:   Resource booking (@13c): 
+# CHECK-NEXT:   Resource booking (@13c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1165,11 +1165,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@13c): 
+# CHECK-NEXT:   getNextResourceCycle (@13c):
 # CHECK-NEXT:     Instance 0 available @14c
 # CHECK-NEXT:     Instance 1 available @13c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @13c
-# CHECK-NEXT:   Resource booking (@13c): 
+# CHECK-NEXT:   Resource booking (@13c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1182,7 +1182,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@13c): 
+# CHECK-NEXT:   getNextResourceCycle (@13c):
 # CHECK-NEXT:     Instance 0 available @14c
 # CHECK-NEXT:     Instance 1 available @13c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[1] available @13c
@@ -1208,7 +1208,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
-# CHECK-NEXT:   Resource booking (@14c): 
+# CHECK-NEXT:   Resource booking (@14c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1221,16 +1221,16 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@14c): 
+# CHECK-NEXT:   getNextResourceCycle (@14c):
 # CHECK-NEXT:     Instance 0 available @14c
 # CHECK-NEXT:     Instance 1 available @14c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @14c
-# CHECK-NEXT: Queue BotQ.P: 1 4 2 
-# CHECK-NEXT: Queue BotQ.A: 0 
+# CHECK-NEXT: Queue BotQ.P: 1 4 2
+# CHECK-NEXT: Queue BotQ.A: 0
 # CHECK-NEXT: Scheduling SU(0) %2:fpr128 = COPY $q2
 # CHECK-NEXT:   Ready @14c
 # CHECK-NEXT:   CortexA55UnitALU +1x1u
-# CHECK-NEXT:   Resource booking (@14c): 
+# CHECK-NEXT:   Resource booking (@14c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1243,11 +1243,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@14c): 
+# CHECK-NEXT:   getNextResourceCycle (@14c):
 # CHECK-NEXT:     Instance 0 available @14c
 # CHECK-NEXT:     Instance 1 available @14c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @14c
-# CHECK-NEXT:   Resource booking (@14c): 
+# CHECK-NEXT:   Resource booking (@14c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 3
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1260,7 +1260,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@14c): 
+# CHECK-NEXT:   getNextResourceCycle (@14c):
 # CHECK-NEXT:     Instance 0 available @14c
 # CHECK-NEXT:     Instance 1 available @14c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @14c
@@ -1285,7 +1285,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
 # CHECK-NEXT: Cycle: 15 BotQ.A
-# CHECK-NEXT:   Resource booking (@15c): 
+# CHECK-NEXT:   Resource booking (@15c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 14
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1298,16 +1298,16 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@15c): 
+# CHECK-NEXT:   getNextResourceCycle (@15c):
 # CHECK-NEXT:     Instance 0 available @15c
 # CHECK-NEXT:     Instance 1 available @15c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @15c
-# CHECK-NEXT: Queue BotQ.P: 2 4 
-# CHECK-NEXT: Queue BotQ.A: 1 
+# CHECK-NEXT: Queue BotQ.P: 2 4
+# CHECK-NEXT: Queue BotQ.A: 1
 # CHECK-NEXT: Scheduling SU(1) %1:fpr128 = COPY $q1
 # CHECK-NEXT:   Ready @15c
 # CHECK-NEXT:   CortexA55UnitALU +1x1u
-# CHECK-NEXT:   Resource booking (@15c): 
+# CHECK-NEXT:   Resource booking (@15c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 14
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1320,11 +1320,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@15c): 
+# CHECK-NEXT:   getNextResourceCycle (@15c):
 # CHECK-NEXT:     Instance 0 available @15c
 # CHECK-NEXT:     Instance 1 available @15c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @15c
-# CHECK-NEXT:   Resource booking (@15c): 
+# CHECK-NEXT:   Resource booking (@15c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 14
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1337,7 +1337,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@15c): 
+# CHECK-NEXT:   getNextResourceCycle (@15c):
 # CHECK-NEXT:     Instance 0 available @15c
 # CHECK-NEXT:     Instance 1 available @15c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @15c
@@ -1361,7 +1361,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
 # CHECK-NEXT: Cycle: 16 BotQ.A
-# CHECK-NEXT:   Resource booking (@16c): 
+# CHECK-NEXT:   Resource booking (@16c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 15
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1374,11 +1374,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@16c): 
+# CHECK-NEXT:   getNextResourceCycle (@16c):
 # CHECK-NEXT:     Instance 0 available @16c
 # CHECK-NEXT:     Instance 1 available @16c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @16c
-# CHECK-NEXT:   Resource booking (@16c): 
+# CHECK-NEXT:   Resource booking (@16c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 15
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1391,19 +1391,19 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@16c): 
+# CHECK-NEXT:   getNextResourceCycle (@16c):
 # CHECK-NEXT:     Instance 0 available @16c
 # CHECK-NEXT:     Instance 1 available @16c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @16c
-# CHECK-NEXT: Queue BotQ.P: 
-# CHECK-NEXT: Queue BotQ.A: 2 4 
-# CHECK-NEXT:   Cand SU(2) ORDER                              
-# CHECK-NEXT:   Cand SU(4) PHYS-REG                           
-# CHECK-NEXT: Pick Bot PHYS-REG  
+# CHECK-NEXT: Queue BotQ.P:
+# CHECK-NEXT: Queue BotQ.A: 2 4
+# CHECK-NEXT:   Cand SU(2) ORDER
+# CHECK-NEXT:   Cand SU(4) PHYS-REG
+# CHECK-NEXT: Pick Bot PHYS-REG
 # CHECK-NEXT: Scheduling SU(4) %6:fpr128 = MOVIv2d_ns 17
 # CHECK-NEXT:   Ready @16c
 # CHECK-NEXT:   CortexA55UnitFPALU +2x1u
-# CHECK-NEXT:   Resource booking (@16c): 
+# CHECK-NEXT:   Resource booking (@16c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 15
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1416,11 +1416,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@16c): 
+# CHECK-NEXT:   getNextResourceCycle (@16c):
 # CHECK-NEXT:     Instance 0 available @16c
 # CHECK-NEXT:     Instance 1 available @16c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @16c
-# CHECK-NEXT:   Resource booking (@16c): 
+# CHECK-NEXT:   Resource booking (@16c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 15
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1433,7 +1433,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@16c): 
+# CHECK-NEXT:   getNextResourceCycle (@16c):
 # CHECK-NEXT:     Instance 0 available @16c
 # CHECK-NEXT:     Instance 1 available @16c
 # CHECK-NEXT:     selecting CortexA55UnitFPALU[0] available @16c
@@ -1459,7 +1459,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
 # CHECK-NEXT: ** ScheduleDAGMILive::schedule picking next node
-# CHECK-NEXT:   Resource booking (@17c): 
+# CHECK-NEXT:   Resource booking (@17c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 15
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1472,16 +1472,16 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@17c): 
+# CHECK-NEXT:   getNextResourceCycle (@17c):
 # CHECK-NEXT:     Instance 0 available @17c
 # CHECK-NEXT:     Instance 1 available @17c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @17c
-# CHECK-NEXT: Queue BotQ.P: 
-# CHECK-NEXT: Queue BotQ.A: 2 
+# CHECK-NEXT: Queue BotQ.P:
+# CHECK-NEXT: Queue BotQ.A: 2
 # CHECK-NEXT: Scheduling SU(2) %0:fpr128 = COPY $q0
 # CHECK-NEXT:   Ready @17c
 # CHECK-NEXT:   CortexA55UnitALU +1x1u
-# CHECK-NEXT:   Resource booking (@17c): 
+# CHECK-NEXT:   Resource booking (@17c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 15
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1494,11 +1494,11 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@17c): 
+# CHECK-NEXT:   getNextResourceCycle (@17c):
 # CHECK-NEXT:     Instance 0 available @17c
 # CHECK-NEXT:     Instance 1 available @17c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @17c
-# CHECK-NEXT:   Resource booking (@17c): 
+# CHECK-NEXT:   Resource booking (@17c):
 # CHECK-NEXT: CortexA55UnitALU(0) = 15
 # CHECK-NEXT: CortexA55UnitALU(1) = 3
 # CHECK-NEXT: CortexA55UnitB(0) = 4294967295
@@ -1511,7 +1511,7 @@ body:             |
 # CHECK-NEXT: CortexA55UnitLd(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitMAC(0) = 4294967295
 # CHECK-NEXT: CortexA55UnitSt(0) = 4294967295
-# CHECK-NEXT:   getNextResourceCycle (@17c): 
+# CHECK-NEXT:   getNextResourceCycle (@17c):
 # CHECK-NEXT:     Instance 0 available @17c
 # CHECK-NEXT:     Instance 1 available @17c
 # CHECK-NEXT:     selecting CortexA55UnitALU[0] available @17c
@@ -1540,31 +1540,31 @@ body:             |
 # CHECK-NEXT:   x: resource booked
 # CHECK-NEXT: Cycle              | 17 | 16 | 15 | 14 | 13 | 12 | 11 | 10 | 9  | 8  | 7  | 6  | 5  | 4  | 3  |
 # CHECK-NEXT: SU(2)              | i  |    |    |    |    |    |    |    |    |    |    |    |    |    |    |
-# CHECK-NEXT:   CortexA55UnitALU | x  |    |    |    |    |    |    |    |    |    |    |    |    |    |    | 
+# CHECK-NEXT:   CortexA55UnitALU | x  |    |    |    |    |    |    |    |    |    |    |    |    |    |    |
 # CHECK-NEXT: SU(4)              |    | i  |    |    |    |    |    |    |    |    |    |    |    |    |    |
-# CHECK-NEXT: CortexA55UnitFPALU |    | x  | x  |    |    |    |    |    |    |    |    |    |    |    |    | 
+# CHECK-NEXT: CortexA55UnitFPALU |    | x  | x  |    |    |    |    |    |    |    |    |    |    |    |    |
 # CHECK-NEXT: SU(1)              |    |    | i  |    |    |    |    |    |    |    |    |    |    |    |    |
-# CHECK-NEXT:   CortexA55UnitALU |    |    | x  |    |    |    |    |    |    |    |    |    |    |    |    | 
+# CHECK-NEXT:   CortexA55UnitALU |    |    | x  |    |    |    |    |    |    |    |    |    |    |    |    |
 # CHECK-NEXT: SU(0)              |    |    |    | i  |    |    |    |    |    |    |    |    |    |    |    |
-# CHECK-NEXT:   CortexA55UnitALU |    |    |    | x  |    |    |    |    |    |    |    |    |    |    |    | 
+# CHECK-NEXT:   CortexA55UnitALU |    |    |    | x  |    |    |    |    |    |    |    |    |    |    |    |
 # CHECK-NEXT: SU(3)              |    |    |    |    | i  |    |    |    |    |    |    |    |    |    |    |
-# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    | x  | x  |    |    |    |    |    |    |    |    |    | 
+# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    | x  | x  |    |    |    |    |    |    |    |    |    |
 # CHECK-NEXT: SU(6)              |    |    |    |    |    | i  |    |    |    |    |    |    |    |    |    |
-# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    | x  | x  |    |    |    |    |    |    |    |    | 
+# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    | x  | x  |    |    |    |    |    |    |    |    |
 # CHECK-NEXT: SU(5)              |    |    |    |    |    |    |    | i  |    |    |    |    |    |    |    |
-# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    | x  | x  |    |    |    |    |    |    | 
+# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    | x  | x  |    |    |    |    |    |    |
 # CHECK-NEXT: SU(7)              |    |    |    |    |    |    |    | i  |    |    |    |    |    |    |    |
-# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    | x  |    |    |    |    |    |    |    | 
+# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    | x  |    |    |    |    |    |    |    |
 # CHECK-NEXT: SU(9)              |    |    |    |    |    |    |    |    | i  |    |    |    |    |    |    |
-# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    |    | x  |    |    |    |    |    |    | 
+# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    |    | x  |    |    |    |    |    |    |
 # CHECK-NEXT: SU(8)              |    |    |    |    |    |    |    |    |    | i  |    |    |    |    |    |
-# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    |    |    | x  | x  |    |    |    |    | 
+# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    |    |    | x  | x  |    |    |    |    |
 # CHECK-NEXT: SU(10)             |    |    |    |    |    |    |    |    |    |    | i  |    |    |    |    |
-# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    |    |    |    | x  | x  |    |    |    | 
+# CHECK-NEXT: CortexA55UnitFPALU |    |    |    |    |    |    |    |    |    |    | x  | x  |    |    |    |
 # CHECK-NEXT: SU(11)             |    |    |    |    |    |    |    |    |    |    |    |    |    |    | i  |
-# CHECK-NEXT:   CortexA55UnitALU |    |    |    |    |    |    |    |    |    |    |    |    |    |    | x  | 
+# CHECK-NEXT:   CortexA55UnitALU |    |    |    |    |    |    |    |    |    |    |    |    |    |    | x  |
 # CHECK-NEXT: SU(12)             |    |    |    |    |    |    |    |    |    |    |    |    |    |    | i  |
-# CHECK-NEXT:   CortexA55UnitALU |    |    |    |    |    |    |    |    |    |    |    |    |    |    | x  | 
+# CHECK-NEXT:   CortexA55UnitALU |    |    |    |    |    |    |    |    |    |    |    |    |    |    | x  |
 # CHECK-NEXT: SU(2) [TopReadyCycle = 0, BottomReadyCycle = 17]:   %0:fpr128 = COPY $q0
 # CHECK-NEXT: SU(4) [TopReadyCycle = 0, BottomReadyCycle = 16]:   %6:fpr128 = MOVIv2d_ns 17
 # CHECK-NEXT: SU(1) [TopReadyCycle = 0, BottomReadyCycle = 15]:   %1:fpr128 = COPY $q1
@@ -1578,7 +1578,7 @@ body:             |
 # CHECK-NEXT: SU(10) [TopReadyCycle = 0, BottomReadyCycle = 7]:   %12:fpr128 = UMULLv4i16_v4i32 %3.dsub:fpr128, %11:fpr64
 # CHECK-NEXT: SU(11) [TopReadyCycle = 0, BottomReadyCycle = 3]:   $q0 = COPY %10:fpr128
 # CHECK-NEXT: SU(12) [TopReadyCycle = 0, BottomReadyCycle = 3]:   $q1 = COPY %12:fpr128
-# CHECK-EMPTY: 
+# CHECK-EMPTY:
 # CHECK-NEXT: ********** INTERVALS **********
 # CHECK-NEXT: B0 [0B,48r:0)[192r,224r:1) 0 at 0B-phi 1 at 192r
 # CHECK-NEXT: B1 [0B,88r:0)[208r,224r:1) 0 at 0B-phi 1 at 208r
@@ -1598,7 +1598,7 @@ body:             |
 # CHECK-NEXT: ********** MACHINEINSTRS **********
 # CHECK-NEXT: # Machine code for function umull_and_v8i32: IsSSA, NoPHIs, TracksLiveness
 # CHECK-NEXT: Function Live Ins: $q0 in %0, $q1 in %1, $q2 in %2
-# CHECK-EMPTY: 
+# CHECK-EMPTY:
 # CHECK-NEXT: 0B	bb.0.entry:
 # CHECK-NEXT: 	  liveins: $q0, $q1, $q2
 # CHECK-NEXT: 48B	  %0:fpr128 = COPY $q0
@@ -1615,5 +1615,5 @@ body:             |
 # CHECK-NEXT: 192B	  $q0 = COPY %10:fpr128
 # CHECK-NEXT: 208B	  $q1 = COPY %12:fpr128
 # CHECK-NEXT: 224B	  RET_ReallyLR implicit $q0, implicit $q1
-# CHECK-EMPTY: 
+# CHECK-EMPTY:
 # CHECK-NEXT: # End machine code for function umull_and_v8i32.

diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-lit.ll b/llvm/test/CodeGen/AArch64/misched-fusion-lit.ll
index ad244d30df11fe..fedbb642a36206 100644
--- a/llvm/test/CodeGen/AArch64/misched-fusion-lit.ll
+++ b/llvm/test/CodeGen/AArch64/misched-fusion-lit.ll
@@ -33,6 +33,7 @@ entry:
 
 ; CHECK-LABEL: litp_tune_generic:
 ; CHECK:         adrp [[R:x[0-9]+]], litp_tune_generic
+; CHECKDONT:     add  {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
 ; CHECK-NEXT:    add {{x[0-9]+}}, [[R]], :lo12:litp_tune_generic
 }
 

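Note (not part of the patch): the CHECKDONT prefix in misched-fusion-lit.ll is, as far as I can tell, only enabled on the RUN line that disables literal fusion, so the line added above records that an unrelated add may be scheduled between the adrp and its dependent :lo12: add when adrp+add fusion is off. A sketch of that RUN-line convention, reconstructed from memory rather than quoted from this patch:

    ; NOTE: the exact triple and attribute spellings below are assumed
    ; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=-fuse-literals | FileCheck %s --check-prefixes=CHECK,CHECKDONT
    ; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=+fuse-literals | FileCheck %s --check-prefixes=CHECK,CHECKFUSE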
diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll
index 8614424edbdd74..33c766f382e6b6 100644
--- a/llvm/test/CodeGen/AArch64/mul_pow2.ll
+++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll
@@ -107,13 +107,13 @@ define i64 @test6_64b(i64 %x) {
 define i64 @test6_umull(i32 %x) {
 ; CHECK-LABEL: test6_umull:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    umull x0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_umull:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    umull x0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -125,13 +125,13 @@ define i64 @test6_umull(i32 %x) {
 define i64 @test6_smull(i32 %x) {
 ; CHECK-LABEL: test6_smull:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    smull x0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_smull:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    smull x0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -143,13 +143,13 @@ define i64 @test6_smull(i32 %x) {
 define i32 @test6_madd(i32 %x, i32 %y) {
 ; CHECK-LABEL: test6_madd:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    madd w0, w0, w8, w1
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_madd:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    madd w0, w0, w8, w1
 ; GISEL-NEXT:    ret
 
@@ -161,13 +161,13 @@ define i32 @test6_madd(i32 %x, i32 %y) {
 define i32 @test6_msub(i32 %x, i32 %y) {
 ; CHECK-LABEL: test6_msub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    msub w0, w0, w8, w1
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_msub:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    msub w0, w0, w8, w1
 ; GISEL-NEXT:    ret
 
@@ -179,13 +179,13 @@ define i32 @test6_msub(i32 %x, i32 %y) {
 define i64 @test6_umaddl(i32 %x, i64 %y) {
 ; CHECK-LABEL: test6_umaddl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    umaddl x0, w0, w8, x1
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_umaddl:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    umaddl x0, w0, w8, x1
 ; GISEL-NEXT:    ret
 
@@ -198,13 +198,13 @@ define i64 @test6_umaddl(i32 %x, i64 %y) {
 define i64 @test6_smaddl(i32 %x, i64 %y) {
 ; CHECK-LABEL: test6_smaddl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    smaddl x0, w0, w8, x1
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_smaddl:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    smaddl x0, w0, w8, x1
 ; GISEL-NEXT:    ret
 
@@ -217,13 +217,13 @@ define i64 @test6_smaddl(i32 %x, i64 %y) {
 define i64 @test6_umsubl(i32 %x, i64 %y) {
 ; CHECK-LABEL: test6_umsubl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    umsubl x0, w0, w8, x1
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_umsubl:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    umsubl x0, w0, w8, x1
 ; GISEL-NEXT:    ret
 
@@ -236,13 +236,13 @@ define i64 @test6_umsubl(i32 %x, i64 %y) {
 define i64 @test6_smsubl(i32 %x, i64 %y) {
 ; CHECK-LABEL: test6_smsubl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    smsubl x0, w0, w8, x1
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_smsubl:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    smsubl x0, w0, w8, x1
 ; GISEL-NEXT:    ret
 
@@ -255,13 +255,13 @@ define i64 @test6_smsubl(i32 %x, i64 %y) {
 define i64 @test6_umnegl(i32 %x) {
 ; CHECK-LABEL: test6_umnegl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    umnegl x0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_umnegl:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    umnegl x0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -274,13 +274,13 @@ define i64 @test6_umnegl(i32 %x) {
 define i64 @test6_smnegl(i32 %x) {
 ; CHECK-LABEL: test6_smnegl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
+; CHECK-NEXT:    mov w8, #6 // =0x6
 ; CHECK-NEXT:    smnegl x0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test6_smnegl:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
+; GISEL-NEXT:    mov w8, #6 // =0x6
 ; GISEL-NEXT:    smnegl x0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -294,15 +294,15 @@ define i64 @test6_smnegl(i32 %x) {
 define i32 @mull6_sub(i32 %x) {
 ; CHECK-LABEL: mull6_sub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
-; CHECK-NEXT:    mov w9, #-1
+; CHECK-NEXT:    mov w8, #6 // =0x6
+; CHECK-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-NEXT:    madd w0, w0, w8, w9
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: mull6_sub:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
-; GISEL-NEXT:    mov w9, #-1
+; GISEL-NEXT:    mov w8, #6 // =0x6
+; GISEL-NEXT:    mov w9, #-1 // =0xffffffff
 ; GISEL-NEXT:    madd w0, w0, w8, w9
 ; GISEL-NEXT:    ret
   %mul = mul nsw i32 %x, 6
@@ -313,15 +313,15 @@ define i32 @mull6_sub(i32 %x) {
 define i64 @mull6_sub_orr(i64 %x) {
 ; CHECK-LABEL: mull6_sub_orr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
-; CHECK-NEXT:    mov x9, #16773120
+; CHECK-NEXT:    mov w8, #6 // =0x6
+; CHECK-NEXT:    mov x9, #16773120 // =0xfff000
 ; CHECK-NEXT:    madd x0, x0, x8, x9
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: mull6_sub_orr:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #6
-; GISEL-NEXT:    mov x9, #16773120
+; GISEL-NEXT:    mov w8, #6 // =0x6
+; GISEL-NEXT:    mov x9, #16773120 // =0xfff000
 ; GISEL-NEXT:    madd x0, x0, x8, x9
 ; GISEL-NEXT:    ret
   %mul = mul nsw i64 %x, 6
@@ -396,13 +396,13 @@ define i32 @test10(i32 %x) {
 define i32 @test11(i32 %x) {
 ; CHECK-LABEL: test11:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #11
+; CHECK-NEXT:    mov w8, #11 // =0xb
 ; CHECK-NEXT:    mul w0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test11:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #11
+; GISEL-NEXT:    mov w8, #11 // =0xb
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -430,13 +430,13 @@ define i32 @test12(i32 %x) {
 define i32 @test13(i32 %x) {
 ; CHECK-LABEL: test13:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #13
+; CHECK-NEXT:    mov w8, #13 // =0xd
 ; CHECK-NEXT:    mul w0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test13:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #13
+; GISEL-NEXT:    mov w8, #13 // =0xd
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -453,7 +453,7 @@ define i32 @test14(i32 %x) {
 ;
 ; GISEL-LABEL: test14:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #14
+; GISEL-NEXT:    mov w8, #14 // =0xe
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -502,7 +502,7 @@ define i32 @test25_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ;
 ; GISEL-LABEL: test25_fast_shift:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #25
+; GISEL-NEXT:    mov w8, #25 // =0x19
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -519,7 +519,7 @@ define i32 @test45_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ;
 ; GISEL-LABEL: test45_fast_shift:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #45
+; GISEL-NEXT:    mov w8, #45 // =0x2d
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -531,13 +531,13 @@ define i32 @test45_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 define i32 @test45(i32 %x) {
 ; CHECK-LABEL: test45:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #45
+; CHECK-NEXT:    mov w8, #45 // =0x2d
 ; CHECK-NEXT:    mul w0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test45:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #45
+; GISEL-NEXT:    mov w8, #45 // =0x2d
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -549,13 +549,13 @@ define i32 @test45(i32 %x) {
 define i32 @test85_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test85_fast_shift:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #85
+; CHECK-NEXT:    mov w8, #85 // =0x55
 ; CHECK-NEXT:    mul w0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test85_fast_shift:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #85
+; GISEL-NEXT:    mov w8, #85 // =0x55
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -567,13 +567,13 @@ define i32 @test85_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 define i32 @test297_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test297_fast_shift:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #297
+; CHECK-NEXT:    mov w8, #297 // =0x129
 ; CHECK-NEXT:    mul w0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test297_fast_shift:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #297
+; GISEL-NEXT:    mov w8, #297 // =0x129
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -593,7 +593,7 @@ define i32 @ntest2(i32 %x) {
 ;
 ; GISEL-LABEL: ntest2:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-2
+; GISEL-NEXT:    mov w8, #-2 // =0xfffffffe
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -624,7 +624,7 @@ define i32 @ntest4(i32 %x) {
 ;
 ; GISEL-LABEL: ntest4:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-4
+; GISEL-NEXT:    mov w8, #-4 // =0xfffffffc
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -657,7 +657,7 @@ define i32 @ntest6(i32 %x) {
 ;
 ; GISEL-LABEL: ntest6:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-6
+; GISEL-NEXT:    mov w8, #-6 // =0xfffffffa
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -688,7 +688,7 @@ define i32 @ntest8(i32 %x) {
 ;
 ; GISEL-LABEL: ntest8:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-8
+; GISEL-NEXT:    mov w8, #-8 // =0xfffffff8
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -716,13 +716,13 @@ define i32 @ntest9(i32 %x) {
 define i32 @ntest10(i32 %x) {
 ; CHECK-LABEL: ntest10:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-10
+; CHECK-NEXT:    mov w8, #-10 // =0xfffffff6
 ; CHECK-NEXT:    mul w0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: ntest10:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-10
+; GISEL-NEXT:    mov w8, #-10 // =0xfffffff6
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -733,13 +733,13 @@ define i32 @ntest10(i32 %x) {
 define i32 @ntest11(i32 %x) {
 ; CHECK-LABEL: ntest11:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-11
+; CHECK-NEXT:    mov w8, #-11 // =0xfffffff5
 ; CHECK-NEXT:    mul w0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: ntest11:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-11
+; GISEL-NEXT:    mov w8, #-11 // =0xfffffff5
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -756,7 +756,7 @@ define i32 @ntest12(i32 %x) {
 ;
 ; GISEL-LABEL: ntest12:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-12
+; GISEL-NEXT:    mov w8, #-12 // =0xfffffff4
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -767,13 +767,13 @@ define i32 @ntest12(i32 %x) {
 define i32 @ntest13(i32 %x) {
 ; CHECK-LABEL: ntest13:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-13
+; CHECK-NEXT:    mov w8, #-13 // =0xfffffff3
 ; CHECK-NEXT:    mul w0, w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: ntest13:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-13
+; GISEL-NEXT:    mov w8, #-13 // =0xfffffff3
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
   %mul = mul nsw i32 %x, -13
@@ -789,7 +789,7 @@ define i32 @ntest14(i32 %x) {
 ;
 ; GISEL-LABEL: ntest14:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-14
+; GISEL-NEXT:    mov w8, #-14 // =0xfffffff2
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -820,7 +820,7 @@ define i32 @ntest16(i32 %x) {
 ;
 ; GISEL-LABEL: ntest16:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #-16
+; GISEL-NEXT:    mov w8, #-16 // =0xfffffff0
 ; GISEL-NEXT:    mul w0, w0, w8
 ; GISEL-NEXT:    ret
 
@@ -837,7 +837,7 @@ define i32 @muladd_demand(i32 %x, i32 %y) {
 ;
 ; GISEL-LABEL: muladd_demand:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #131008
+; GISEL-NEXT:    mov w8, #131008 // =0x1ffc0
 ; GISEL-NEXT:    madd w8, w0, w8, w1
 ; GISEL-NEXT:    and w0, w8, #0x1ffc0
 ; GISEL-NEXT:    ret
@@ -850,8 +850,8 @@ define i32 @muladd_demand(i32 %x, i32 %y) {
 define <4 x i32> @muladd_demand_commute(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: muladd_demand_commute:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #1, msl #16
 ; CHECK-NEXT:    shl v0.4s, v0.4s, #6
+; CHECK-NEXT:    movi v2.4s, #1, msl #16
 ; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret

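Note (not part of the patch): the `// =0x...` churn in mul_pow2.ll comes from regenerating the checks. The AArch64 instruction printer annotates mov-immediate aliases with the immediate's hexadecimal encoding, and update_llc_test_checks.py now copies that comment into the CHECK line, so a negative immediate appears as its 32-bit two's-complement value. A minimal reading aid, using values from the hunks above:

    ; Reading aid: the trailing comment is the immediate as unsigned hex,
    ; e.g. 6 -> 0x6 and -10 -> 0xfffffff6 (32-bit two's complement).
    ; CHECK-NEXT:    mov w8, #6 // =0x6
    ; CHECK-NEXT:    mov w8, #-10 // =0xfffffff6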
diff --git a/llvm/test/CodeGen/AArch64/mulcmle.ll b/llvm/test/CodeGen/AArch64/mulcmle.ll
index 7a0c946410164e..5c216b85500801 100644
--- a/llvm/test/CodeGen/AArch64/mulcmle.ll
+++ b/llvm/test/CodeGen/AArch64/mulcmle.ll
@@ -83,8 +83,8 @@ define <8 x i8> @v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.8b, #17
-; CHECK-NEXT:    movi v2.8b, #15
 ; CHECK-NEXT:    ushr v0.8b, v0.8b, #3
+; CHECK-NEXT:    movi v2.8b, #15
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    mul v0.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
index 38bd6447ac7ce2..9210a5ec1c8b7d 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
@@ -39,9 +39,8 @@ define <2 x i8> @splice_v2i8_idx(<2 x i8> %a, <2 x i8> %b) #0 {
 define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 {
 ; CHECK-LABEL: splice_v8i32_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v3.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #4
-; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT:    ret
   %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
   ret <8 x i32> %res
@@ -51,12 +50,11 @@ define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 {
 define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-LABEL: splice_v16f32_idx:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v5.16b, v4.16b, v5.16b, #12
-; CHECK-NEXT:    ext v6.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT:    ext v6.16b, v3.16b, v4.16b, #12
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #12
-; CHECK-NEXT:    ext v2.16b, v3.16b, v4.16b, #12
-; CHECK-NEXT:    mov v3.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
+; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #12
+; CHECK-NEXT:    mov v2.16b, v6.16b
 ; CHECK-NEXT:    ret
   %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
   ret <16 x float> %res
@@ -98,9 +96,8 @@ define <2 x i8> @splice_v2i8(<2 x i8> %a, <2 x i8> %b) #0 {
 define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
 ; CHECK-LABEL: splice_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v3.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #4
-; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT:    ret
   %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
   ret <8 x i32> %res
@@ -110,12 +107,11 @@ define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
 define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-LABEL: splice_v16f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v5.16b, v4.16b, v5.16b, #12
-; CHECK-NEXT:    ext v6.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT:    ext v6.16b, v3.16b, v4.16b, #12
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #12
-; CHECK-NEXT:    ext v2.16b, v3.16b, v4.16b, #12
-; CHECK-NEXT:    mov v3.16b, v5.16b
-; CHECK-NEXT:    mov v1.16b, v6.16b
+; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #12
+; CHECK-NEXT:    mov v2.16b, v6.16b
 ; CHECK-NEXT:    ret
   %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
   ret <16 x float> %res

diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index 404811433ac20e..f1e95ca9c206f0 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -258,8 +258,8 @@ define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.d, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p2.d
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    cmpne p0.d, p2/z, z1.d, #0
 ; CHECK-NEXT:    ret
@@ -273,8 +273,8 @@ define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.s, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p2.s
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    and z1.s, z1.s, #0x1
 ; CHECK-NEXT:    cmpne p0.s, p2/z, z1.s, #0
 ; CHECK-NEXT:    ret
@@ -288,8 +288,8 @@ define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.h, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
-; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p2.h
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    and z1.h, z1.h, #0x1
 ; CHECK-NEXT:    cmpne p0.h, p2/z, z1.h, #0
 ; CHECK-NEXT:    ret
@@ -303,8 +303,8 @@ define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 1
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    and z1.b, z1.b, #0x1
 ; CHECK-NEXT:    cmpne p0.b, p2/z, z1.b, #0
 ; CHECK-NEXT:    ret
@@ -328,8 +328,8 @@ define <vscale x 8 x i32> @splice_nxv8i32_idx(<vscale x 8 x i32> %a, <vscale x 8
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
-; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    orr x8, x8, #0x8
 ; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
@@ -350,20 +350,20 @@ define <vscale x 16 x float> @splice_nxv16f32_16(<vscale x 16 x float> %a, <vsca
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-8
-; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w9, #16
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov w9, #16 // =0x10
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    cmp x8, #16
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    add x10, x9, x8, lsl #2
+; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    st1w { z7.s }, p0, [sp, #7, mul vl]
 ; CHECK-NEXT:    st1w { z4.s }, p0, [sp, #4, mul vl]
-; CHECK-NEXT:    add x10, x9, x8, lsl #2
 ; CHECK-NEXT:    st1w { z5.s }, p0, [sp, #5, mul vl]
 ; CHECK-NEXT:    st1w { z6.s }, p0, [sp, #6, mul vl]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
@@ -452,15 +452,15 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg17(<vscale x 16 x i8> %a, <vscale x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    cmp x9, #17
-; CHECK-NEXT:    mov w10, #17
-; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    mov w9, #17 // =0x11
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    cmp x8, #17
+; CHECK-NEXT:    addvl x10, x10, #1
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    sub x8, x10, x8
 ; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK-NEXT:    sub x8, x8, x9
 ; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -497,15 +497,15 @@ define <vscale x 8 x i16> @splice_nxv8i16_neg9(<vscale x 8 x i16> %a, <vscale x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    cmp x9, #18
-; CHECK-NEXT:    mov w10, #18
-; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    mov w9, #18 // =0x12
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    cmp x8, #18
+; CHECK-NEXT:    addvl x10, x10, #1
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    sub x8, x10, x8
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    sub x8, x8, x9
 ; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -608,15 +608,15 @@ define <vscale x 8 x half> @splice_nxv8f16_neg9(<vscale x 8 x half> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    rdvl x9, #1
-; CHECK-NEXT:    cmp x9, #18
-; CHECK-NEXT:    mov w10, #18
-; CHECK-NEXT:    csel x9, x9, x10, lo
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    mov w9, #18 // =0x12
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    cmp x8, #18
+; CHECK-NEXT:    addvl x10, x10, #1
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    sub x8, x10, x8
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    sub x8, x8, x9
 ; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -698,10 +698,10 @@ define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1>
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p2.d, vl1
 ; CHECK-NEXT:    mov z0.d, p1/z, #1 // =0x1
-; CHECK-NEXT:    rev p2.d, p2.d
 ; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
-; CHECK-NEXT:    splice z1.d, p2, z1.d, z0.d
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    rev p2.d, p2.d
+; CHECK-NEXT:    splice z1.d, p2, z1.d, z0.d
 ; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    ret
@@ -715,10 +715,10 @@ define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1>
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p2.s, vl1
 ; CHECK-NEXT:    mov z0.s, p1/z, #1 // =0x1
-; CHECK-NEXT:    rev p2.s, p2.s
 ; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    splice z1.s, p2, z1.s, z0.s
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    rev p2.s, p2.s
+; CHECK-NEXT:    splice z1.s, p2, z1.s, z0.s
 ; CHECK-NEXT:    and z1.s, z1.s, #0x1
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
 ; CHECK-NEXT:    ret
@@ -732,10 +732,10 @@ define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1>
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p2.h, vl1
 ; CHECK-NEXT:    mov z0.h, p1/z, #1 // =0x1
-; CHECK-NEXT:    rev p2.h, p2.h
 ; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
-; CHECK-NEXT:    splice z1.h, p2, z1.h, z0.h
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    rev p2.h, p2.h
+; CHECK-NEXT:    splice z1.h, p2, z1.h, z0.h
 ; CHECK-NEXT:    and z1.h, z1.h, #0x1
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
 ; CHECK-NEXT:    ret
@@ -749,10 +749,10 @@ define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p2.b, vl1
 ; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
-; CHECK-NEXT:    rev p2.b, p2.b
 ; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
-; CHECK-NEXT:    splice z1.b, p2, z1.b, z0.b
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    rev p2.b, p2.b
+; CHECK-NEXT:    splice z1.b, p2, z1.b, z0.b
 ; CHECK-NEXT:    and z1.b, z1.b, #0x1
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    ret
@@ -778,14 +778,14 @@ define <vscale x 8 x i32> @splice_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x9, #-8
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov x9, #-8 // =0xfffffffffffffff8
+; CHECK-NEXT:    addvl x8, x8, #2
+; CHECK-NEXT:    sub x10, x8, #32
 ; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    sub x10, x8, #32
 ; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, #1, mul vl]
@@ -802,15 +802,15 @@ define <vscale x 16 x float> @splice_nxv16f32_neg17(<vscale x 16 x float> %a, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-8
-; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    rdvl x8, #4
+; CHECK-NEXT:    mov w9, #68 // =0x44
+; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    cmp x8, #68
-; CHECK-NEXT:    mov w9, #68
 ; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    addvl x9, x10, #4
-; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]

diff --git a/llvm/test/CodeGen/AArch64/neg-imm.ll b/llvm/test/CodeGen/AArch64/neg-imm.ll
index baf1463058664f..70948fdb8c6990 100644
--- a/llvm/test/CodeGen/AArch64/neg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/neg-imm.ll
@@ -20,9 +20,8 @@ define void @test(i32 %px) {
 ; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: // %for.inc
 ; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    add w8, w20, #1
 ; CHECK-NEXT:    cmp w20, w19
-; CHECK-NEXT:    mov w20, w8
+; CHECK-NEXT:    add w20, w20, #1
 ; CHECK-NEXT:    b.gt .LBB0_4
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1

diff --git a/llvm/test/CodeGen/AArch64/neon-abd.ll b/llvm/test/CodeGen/AArch64/neon-abd.ll
index 0d22470b01a97f..901cb8adc23f09 100644
--- a/llvm/test/CodeGen/AArch64/neon-abd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-abd.ll
@@ -145,22 +145,22 @@ define <2 x i64> @sabd_2d(<2 x i64> %a, <2 x i64> %b) #0 {
 ; CHECK-LABEL: sabd_2d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov x10, d0
 ; CHECK-NEXT:    mov x9, v1.d[1]
-; CHECK-NEXT:    fmov x11, d1
-; CHECK-NEXT:    asr x12, x10, #63
-; CHECK-NEXT:    asr x14, x8, #63
-; CHECK-NEXT:    asr x15, x9, #63
+; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    fmov x12, d1
+; CHECK-NEXT:    asr x14, x10, #63
+; CHECK-NEXT:    asr x11, x8, #63
+; CHECK-NEXT:    asr x13, x9, #63
+; CHECK-NEXT:    asr x15, x12, #63
 ; CHECK-NEXT:    subs x8, x8, x9
-; CHECK-NEXT:    asr x13, x11, #63
-; CHECK-NEXT:    sbc x9, x14, x15
-; CHECK-NEXT:    subs x10, x10, x11
+; CHECK-NEXT:    sbc x9, x11, x13
+; CHECK-NEXT:    subs x10, x10, x12
+; CHECK-NEXT:    sbc x11, x14, x15
 ; CHECK-NEXT:    asr x9, x9, #63
-; CHECK-NEXT:    sbc x11, x12, x13
-; CHECK-NEXT:    eor x8, x8, x9
 ; CHECK-NEXT:    asr x11, x11, #63
-; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    eor x8, x8, x9
 ; CHECK-NEXT:    eor x10, x10, x11
+; CHECK-NEXT:    sub x8, x8, x9
 ; CHECK-NEXT:    sub x10, x10, x11
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    fmov d0, x10
@@ -325,8 +325,8 @@ define <2 x i64> @uabd_2d(<2 x i64> %a, <2 x i64> %b) #0 {
 ; CHECK-LABEL: uabd_2d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov x10, d0
 ; CHECK-NEXT:    mov x9, v1.d[1]
+; CHECK-NEXT:    fmov x10, d0
 ; CHECK-NEXT:    fmov x11, d1
 ; CHECK-NEXT:    subs x8, x8, x9
 ; CHECK-NEXT:    ngc x9, xzr

diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 50f38b1744ef5c..221a1eb693ffa4 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -896,9 +896,9 @@ define <4 x i32> @vselect_constant_cond_v4i32(<4 x i32> %a, <4 x i32> %b) {
 define <8 x i8> @vselect_equivalent_shuffle_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: vselect_equivalent_shuffle_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI89_0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    adrp x8, .LCPI89_0
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI89_0]
 ; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
@@ -929,8 +929,8 @@ define <8 x i8> @vselect_equivalent_shuffle_v8i8_zero(<8 x i8> %a) {
 define <8 x i8> @vselect_equivalent_shuffle_v8i8_zeroswap(<8 x i8> %a) {
 ; CHECK-LABEL: vselect_equivalent_shuffle_v8i8_zeroswap:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI91_0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    adrp x8, .LCPI91_0
 ; CHECK-NEXT:    mov v0.d[1], v0.d[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI91_0]
 ; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
@@ -961,8 +961,8 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI92_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI92_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
   %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7>
@@ -1495,10 +1495,11 @@ entry:
 define <8 x i32> @bic_shifted_knownbits2(<8 x i16> %v) {
 ; CHECK-LABEL: bic_shifted_knownbits2:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ushll v2.4s, v0.4h, #0
 ; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    bic v2.4s, #255, lsl #8
 ; CHECK-NEXT:    bic v1.4s, #255, lsl #8
-; CHECK-NEXT:    bic v0.4s, #255, lsl #8
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %vshr_n = zext <8 x i16> %v to <8 x i32>
@@ -1522,8 +1523,8 @@ define <8 x i32> @bic_shifted_knownbits3(<8 x i16> %v) {
 define <8 x i32> @bic_shifted_knownbits4(<8 x i32> %v) {
 ; CHECK-LABEL: bic_shifted_knownbits4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    shl v0.4s, v0.4s, #8
 ; CHECK-NEXT:    shl v1.4s, v1.4s, #8
+; CHECK-NEXT:    shl v0.4s, v0.4s, #8
 ; CHECK-NEXT:    bic v0.4s, #255, lsl #8
 ; CHECK-NEXT:    bic v1.4s, #255, lsl #8
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
index 4b65f54c1caa94..1919a42e0a2f8c 100644
--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1788,6 +1788,12 @@ define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI132_0]
 ; CHECK-GI-NEXT:    cmhs v0.2d, v0.2d, v1.2d
 ; CHECK-GI-NEXT:    ret
+; GISEL-LABEL: cmhsz2xi64:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    adrp x8, .LCPI132_0
+; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI132_0]
+; GISEL-NEXT:    cmhs v0.2d, v0.2d, v1.2d
+; GISEL-NEXT:    ret
   %tmp3 = icmp uge <2 x i64> %A, <i64 2, i64 2>
   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
   ret <2 x i64> %tmp4
@@ -1916,6 +1922,12 @@ define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI139_0]
 ; CHECK-GI-NEXT:    cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-GI-NEXT:    ret
+; GISEL-LABEL: cmhiz2xi64:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    adrp x8, .LCPI139_0
+; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI139_0]
+; GISEL-NEXT:    cmhi v0.2d, v0.2d, v1.2d
+; GISEL-NEXT:    ret
   %tmp3 = icmp ugt <2 x i64> %A, <i64 1, i64 1>
   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
   ret <2 x i64> %tmp4
@@ -2134,6 +2146,12 @@ define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI153_0]
 ; CHECK-GI-NEXT:    cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-GI-NEXT:    ret
+; GISEL-LABEL: cmloz2xi64:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    adrp x8, .LCPI153_0
+; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI153_0]
+; GISEL-NEXT:    cmhi v0.2d, v1.2d, v0.2d
+; GISEL-NEXT:    ret
   %tmp3 = icmp ult <2 x i64> %A, <i64 2, i64 2>
   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
   ret <2 x i64> %tmp4
@@ -4279,11 +4297,18 @@ define <4 x i32> @fcmule4xfloat_fast_zext(<4 x float> %A, <4 x float> %B) {
 ;
 ; CHECK-GI-LABEL: fcmule4xfloat_fast_zext:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI322_0
 ; CHECK-GI-NEXT:    fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    adrp x8, .LCPI322_0
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI322_0]
 ; CHECK-GI-NEXT:    bic v0.16b, v1.16b, v0.16b
 ; CHECK-GI-NEXT:    ret
+; GISEL-LABEL: fcmule4xfloat_fast_zext:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    fcmgt v0.4s, v0.4s, v1.4s
+; GISEL-NEXT:    adrp x8, .LCPI322_0
+; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI322_0]
+; GISEL-NEXT:    bic v0.16b, v1.16b, v0.16b
+; GISEL-NEXT:    ret
   %tmp3 = fcmp fast ule <4 x float> %A, %B
   %tmp4 = zext <4 x i1> %tmp3 to <4 x i32>
   ret <4 x i32> %tmp4
@@ -4310,8 +4335,8 @@ define <4 x i1> @fcmule4xfloat_fast_aext(<4 x float> %A, <4 x float> %B) {
 define <4 x i64> @fcmoeq4xdouble(<4 x double> %A, <4 x double> %B) {
 ; CHECK-SD-LABEL: fcmoeq4xdouble:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fcmeq v0.2d, v0.2d, v2.2d
 ; CHECK-SD-NEXT:    fcmeq v1.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT:    fcmeq v0.2d, v0.2d, v2.2d
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fcmoeq4xdouble:
@@ -4331,8 +4356,8 @@ define <4 x i64> @fcmoeq4xdouble(<4 x double> %A, <4 x double> %B) {
 define <8 x i32> @fcmoeq8xfloat(<8 x float> %A, <8 x float> %B) {
 ; CHECK-SD-LABEL: fcmoeq8xfloat:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fcmeq v0.4s, v0.4s, v2.4s
 ; CHECK-SD-NEXT:    fcmeq v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT:    fcmeq v0.4s, v0.4s, v2.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fcmoeq8xfloat:

diff --git a/llvm/test/CodeGen/AArch64/neon-dotpattern.ll b/llvm/test/CodeGen/AArch64/neon-dotpattern.ll
index 6582dd9e382264..7f8923e6b3a925 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotpattern.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotpattern.ll
@@ -5,8 +5,8 @@ define fastcc void @test_sdot_v4i8(ptr noalias nocapture %0, ptr noalias nocaptu
 ; CHECK-LABEL: test_sdot_v4i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr w8, [x2]
-; CHECK-NEXT:    dup v0.2s, wzr
 ; CHECK-NEXT:    ldr w9, [x1]
+; CHECK-NEXT:    dup v0.2s, wzr
 ; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.8b
@@ -51,8 +51,8 @@ define fastcc void @test_udot_v4i8(ptr noalias nocapture %0, ptr noalias nocaptu
 ; CHECK-LABEL: test_udot_v4i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr w8, [x2]
-; CHECK-NEXT:    dup v0.2s, wzr
 ; CHECK-NEXT:    ldr w9, [x1]
+; CHECK-NEXT:    dup v0.2s, wzr
 ; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    udot v0.2s, v1.8b, v2.8b

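Note (not part of the patch): the neon-dotreduce.ll tests below feed a widening multiply plus vector.reduce.add to the backend and check what it lowers to under the new model. A reconstructed sketch of the kind of IR behind test_udot_v4i8 follows; the value names and the nuw/nsw flags are my guess, not quoted from the test:

    ; reconstructed sketch, not the test's actual body
    define i32 @udot_v4i8_pattern(ptr %a, ptr %b, i32 %sum) {
    entry:
      ; load two <4 x i8> operands, widen them, multiply, then reduce
      %va = load <4 x i8>, ptr %a
      %vb = load <4 x i8>, ptr %b
      %ea = zext <4 x i8> %va to <4 x i32>
      %eb = zext <4 x i8> %vb to <4 x i32>
      %m = mul nuw nsw <4 x i32> %ea, %eb
      %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
      %s = add i32 %r, %sum
      ret i32 %s
    }
    declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)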
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 18537798940fc4..40a8128857cb7f 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -15,11 +15,11 @@ declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
 define i32 @test_udot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v4i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr s0, [x1]
-; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    umull v0.4s, v1.4h, v0.4h
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -53,11 +53,11 @@ entry:
 define i32 @test_sdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_sdot_v4i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr s0, [x1]
-; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -78,17 +78,17 @@ define i32 @test_sdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-NEXT:    shl v3.4s, v3.4s, #24
-; CHECK-NEXT:    shl v2.4s, v2.4s, #24
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    sshr v3.4s, v3.4s, #24
 ; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshr v2.4s, v2.4s, #24
-; CHECK-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-NEXT:    shl v2.4s, v2.4s, #24
+; CHECK-NEXT:    shl v3.4s, v3.4s, #24
 ; CHECK-NEXT:    shl v1.4s, v1.4s, #24
-; CHECK-NEXT:    mul v2.4s, v2.4s, v3.4s
-; CHECK-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-NEXT:    shl v0.4s, v0.4s, #24
+; CHECK-NEXT:    sshr v2.4s, v2.4s, #24
+; CHECK-NEXT:    sshr v3.4s, v3.4s, #24
 ; CHECK-NEXT:    sshr v1.4s, v1.4s, #24
+; CHECK-NEXT:    sshr v0.4s, v0.4s, #24
+; CHECK-NEXT:    mul v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v2.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -130,15 +130,15 @@ entry:
 define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v5i8:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    umull2 v3.4s, v1.8h, v2.8h
-; CHECK-NEXT:    mov v0.s[0], v3.s[0]
-; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.4h
-; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    umull2 v2.4s, v1.8h, v0.8h
+; CHECK-NEXT:    mov v3.s[0], v2.s[0]
+; CHECK-NEXT:    umlal v3.4s, v1.4h, v0.4h
+; CHECK-NEXT:    addv s0, v3.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
 ; CHECK-NEXT:    ret
@@ -156,12 +156,12 @@ entry:
 define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_udot_v5i8_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    mov v0.s[0], v2.s[0]
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-NEXT:    mov v1.s[0], v2.s[0]
+; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -174,15 +174,15 @@ entry:
 define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_sdot_v5i8:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x0]
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    smull2 v3.4s, v1.8h, v2.8h
-; CHECK-NEXT:    mov v0.s[0], v3.s[0]
-; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
-; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    smull2 v2.4s, v1.8h, v0.8h
+; CHECK-NEXT:    mov v3.s[0], v2.s[0]
+; CHECK-NEXT:    smlal v3.4s, v1.4h, v0.4h
+; CHECK-NEXT:    addv s0, v3.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
 ; CHECK-NEXT:    ret
@@ -200,19 +200,19 @@ entry:
 define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
 ; CHECK-LABEL: test_sdot_v5i8_double:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEXT:    smull2 v5.4s, v0.8h, v1.8h
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
 ; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    smull2 v4.4s, v0.8h, v1.8h
 ; CHECK-NEXT:    smull2 v7.4s, v2.8h, v3.8h
-; CHECK-NEXT:    mov v6.s[0], v5.s[0]
-; CHECK-NEXT:    mov v4.s[0], v7.s[0]
+; CHECK-NEXT:    mov v6.s[0], v4.s[0]
+; CHECK-NEXT:    mov v5.s[0], v7.s[0]
 ; CHECK-NEXT:    smlal v6.4s, v0.4h, v1.4h
-; CHECK-NEXT:    smlal v4.4s, v2.4h, v3.4h
-; CHECK-NEXT:    add v0.4s, v6.4s, v4.4s
+; CHECK-NEXT:    smlal v5.4s, v2.4h, v3.4h
+; CHECK-NEXT:    add v0.4s, v6.4s, v5.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -232,16 +232,16 @@ entry:
 define i32 @test_sdot_v5i8_double_nomla(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
 ; CHECK-LABEL: test_sdot_v5i8_double_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NEXT:    sshll v1.8h, v2.8b, #0
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    sshll2 v4.4s, v0.8h, #0
-; CHECK-NEXT:    sshll2 v5.4s, v2.8h, #0
+; CHECK-NEXT:    sshll2 v5.4s, v1.8h, #0
 ; CHECK-NEXT:    mov v3.s[0], v4.s[0]
-; CHECK-NEXT:    mov v1.s[0], v5.s[0]
+; CHECK-NEXT:    mov v2.s[0], v5.s[0]
 ; CHECK-NEXT:    saddw v0.4s, v3.4s, v0.4h
-; CHECK-NEXT:    saddw v1.4s, v1.4s, v2.4h
+; CHECK-NEXT:    saddw v1.4s, v2.4s, v1.4h
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -278,11 +278,11 @@ entry:
 define i32 @test_udot_v8i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_udot_v8i8_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v0.8b, #1
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.8b, #1
 ; CHECK-NEXT:    ldr d2, [x0]
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    udot v1.2s, v2.8b, v0.8b
-; CHECK-NEXT:    addp v0.2s, v1.2s, v1.2s
+; CHECK-NEXT:    udot v0.2s, v2.8b, v1.8b
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -315,11 +315,11 @@ entry:
 define i32 @test_sdot_v8i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_sdot_v8i8_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v0.8b, #1
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.8b, #1
 ; CHECK-NEXT:    ldr d2, [x0]
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    sdot v1.2s, v2.8b, v0.8b
-; CHECK-NEXT:    addp v0.2s, v1.2s, v1.2s
+; CHECK-NEXT:    sdot v0.2s, v2.8b, v1.8b
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -334,9 +334,9 @@ define i32 @test_udot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ; CHECK-LABEL: test_udot_v16i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
-; CHECK-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    udot v0.4s, v2.16b, v1.16b
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -356,8 +356,8 @@ define i32 @test_udot_v16i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_udot_v16i8_nomla:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.16b, #1
-; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    udot v1.4s, v2.16b, v0.16b
 ; CHECK-NEXT:    addv s0, v1.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -373,9 +373,9 @@ define i32 @test_sdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ; CHECK-LABEL: test_sdot_v16i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
-; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    sdot v0.4s, v2.16b, v1.16b
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -395,8 +395,8 @@ define i32 @test_sdot_v16i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_sdot_v16i8_nomla:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.16b, #1
-; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    sdot v1.4s, v2.16b, v0.16b
 ; CHECK-NEXT:    addv s0, v1.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -434,11 +434,11 @@ entry:
 define i32 @test_udot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; CHECK-LABEL: test_udot_v8i8_double_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v1.8b, #1
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    udot v3.2s, v2.8b, v1.8b
-; CHECK-NEXT:    udot v3.2s, v0.8b, v1.8b
-; CHECK-NEXT:    addp v0.2s, v3.2s, v3.2s
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    movi v3.8b, #1
+; CHECK-NEXT:    udot v1.2s, v2.8b, v3.8b
+; CHECK-NEXT:    udot v1.2s, v0.8b, v3.8b
+; CHECK-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -516,11 +516,11 @@ entry:
 define i32 @test_sdot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; CHECK-LABEL: test_sdot_v8i8_double_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v1.8b, #1
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    sdot v3.2s, v2.8b, v1.8b
-; CHECK-NEXT:    sdot v3.2s, v0.8b, v1.8b
-; CHECK-NEXT:    addp v0.2s, v3.2s, v3.2s
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    movi v3.8b, #1
+; CHECK-NEXT:    sdot v1.2s, v2.8b, v3.8b
+; CHECK-NEXT:    sdot v1.2s, v0.8b, v3.8b
+; CHECK-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
 entry:
@@ -577,17 +577,17 @@ define i32 @test_udot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ; CHECK-LABEL: test_udot_v24i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    ldr d2, [x0, #16]
-; CHECK-NEXT:    ldr d4, [x1, #16]
-; CHECK-NEXT:    ldr q5, [x1]
-; CHECK-NEXT:    udot v0.2s, v4.8b, v2.8b
-; CHECK-NEXT:    udot v3.4s, v5.16b, v1.16b
-; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT:    addv s1, v3.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q3, [x1]
+; CHECK-NEXT:    ldr d4, [x0, #16]
+; CHECK-NEXT:    ldr d5, [x1, #16]
+; CHECK-NEXT:    udot v1.2s, v5.8b, v4.8b
+; CHECK-NEXT:    udot v0.4s, v3.16b, v2.16b
+; CHECK-NEXT:    addp v1.2s, v1.2s, v1.2s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w0, w8, w2
 ; CHECK-NEXT:    ret
@@ -605,18 +605,18 @@ entry:
 define i32 @test_udot_v24i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_udot_v24i8_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v0.8b, #1
-; CHECK-NEXT:    ldr d4, [x0, #16]
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    ldr q5, [x0]
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.8b, #1
+; CHECK-NEXT:    ldr q4, [x0]
 ; CHECK-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-NEXT:    movi v3.16b, #1
-; CHECK-NEXT:    udot v2.2s, v4.8b, v0.8b
-; CHECK-NEXT:    udot v1.4s, v5.16b, v3.16b
-; CHECK-NEXT:    addp v0.2s, v2.2s, v2.2s
-; CHECK-NEXT:    addv s1, v1.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    ldr d5, [x0, #16]
+; CHECK-NEXT:    udot v2.2s, v5.8b, v1.8b
+; CHECK-NEXT:    udot v0.4s, v4.16b, v3.16b
+; CHECK-NEXT:    addp v1.2s, v2.2s, v2.2s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    add w0, w9, w8
 ; CHECK-NEXT:    ret
 entry:
@@ -629,17 +629,17 @@ define i32 @test_sdot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ; CHECK-LABEL: test_sdot_v24i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    ldr d2, [x0, #16]
-; CHECK-NEXT:    ldr d4, [x1, #16]
-; CHECK-NEXT:    ldr q5, [x1]
-; CHECK-NEXT:    sdot v0.2s, v4.8b, v2.8b
-; CHECK-NEXT:    sdot v3.4s, v5.16b, v1.16b
-; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT:    addv s1, v3.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    ldr q3, [x1]
+; CHECK-NEXT:    ldr d4, [x0, #16]
+; CHECK-NEXT:    ldr d5, [x1, #16]
+; CHECK-NEXT:    sdot v1.2s, v5.8b, v4.8b
+; CHECK-NEXT:    sdot v0.4s, v3.16b, v2.16b
+; CHECK-NEXT:    addp v1.2s, v1.2s, v1.2s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    add w0, w8, w2
 ; CHECK-NEXT:    ret
@@ -660,200 +660,200 @@ define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldr b0, [sp, #144]
-; CHECK-NEXT:    add x8, sp, #152
-; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    add x9, sp, #168
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ldr b1, [sp, #144]
+; CHECK-NEXT:    add x10, sp, #152
+; CHECK-NEXT:    add x9, sp, #160
+; CHECK-NEXT:    add x8, sp, #168
 ; CHECK-NEXT:    ldr b2, [sp, #272]
+; CHECK-NEXT:    ld1 { v1.b }[1], [x10]
 ; CHECK-NEXT:    add x11, sp, #280
-; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #160
-; CHECK-NEXT:    mov v1.b[1], w1
-; CHECK-NEXT:    add x10, sp, #184
+; CHECK-NEXT:    ldr b3, [sp, #80]
+; CHECK-NEXT:    mov v0.b[1], w1
+; CHECK-NEXT:    ldr b4, [sp, #528]
+; CHECK-NEXT:    ldr b6, [sp, #656]
+; CHECK-NEXT:    add x10, sp, #88
 ; CHECK-NEXT:    ld1 { v2.b }[1], [x11]
 ; CHECK-NEXT:    add x11, sp, #536
-; CHECK-NEXT:    ldr b4, [sp, #528]
-; CHECK-NEXT:    add x12, sp, #88
-; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #176
-; CHECK-NEXT:    mov v1.b[2], w2
+; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
 ; CHECK-NEXT:    ldr b5, [sp, #336]
+; CHECK-NEXT:    ldr b7, [sp, #464]
+; CHECK-NEXT:    add x12, sp, #664
+; CHECK-NEXT:    ld1 { v3.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #344
+; CHECK-NEXT:    mov v0.b[2], w2
 ; CHECK-NEXT:    ld1 { v4.b }[1], [x11]
-; CHECK-NEXT:    add x11, sp, #344
-; CHECK-NEXT:    ldr b3, [sp, #80]
-; CHECK-NEXT:    add x13, sp, #96
-; CHECK-NEXT:    ld1 { v0.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #192
-; CHECK-NEXT:    mov v1.b[3], w3
-; CHECK-NEXT:    ld1 { v5.b }[1], [x11]
-; CHECK-NEXT:    ld1 { v3.b }[1], [x12]
-; CHECK-NEXT:    add x12, sp, #200
-; CHECK-NEXT:    add x11, sp, #544
-; CHECK-NEXT:    ldr b7, [sp, #656]
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #288
-; CHECK-NEXT:    mov v1.b[4], w4
-; CHECK-NEXT:    ldr b16, [sp, #464]
-; CHECK-NEXT:    ld1 { v3.b }[2], [x13]
-; CHECK-NEXT:    ld1 { v2.b }[2], [x8]
+; CHECK-NEXT:    add x11, sp, #176
+; CHECK-NEXT:    ld1 { v1.b }[3], [x8]
+; CHECK-NEXT:    add x9, sp, #472
+; CHECK-NEXT:    ld1 { v6.b }[1], [x12]
+; CHECK-NEXT:    ld1 { v5.b }[1], [x10]
+; CHECK-NEXT:    add x12, sp, #288
+; CHECK-NEXT:    ld1 { v7.b }[1], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[2], [x12]
+; CHECK-NEXT:    add x8, sp, #96
+; CHECK-NEXT:    add x13, sp, #544
+; CHECK-NEXT:    mov v0.b[3], w3
+; CHECK-NEXT:    ld1 { v1.b }[4], [x11]
+; CHECK-NEXT:    add x11, sp, #672
+; CHECK-NEXT:    add x10, sp, #184
+; CHECK-NEXT:    ld1 { v3.b }[2], [x8]
 ; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ld1 { v0.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #296
-; CHECK-NEXT:    mov v1.b[5], w5
-; CHECK-NEXT:    ld1 { v4.b }[2], [x11]
+; CHECK-NEXT:    ld1 { v4.b }[2], [x13]
+; CHECK-NEXT:    ld1 { v6.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #480
 ; CHECK-NEXT:    ld1 { v5.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #360
-; CHECK-NEXT:    ld1 { v2.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #208
-; CHECK-NEXT:    ld1 { v0.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #104
-; CHECK-NEXT:    mov v1.b[6], w6
+; CHECK-NEXT:    ld1 { v7.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #296
+; CHECK-NEXT:    mov v0.b[4], w4
+; CHECK-NEXT:    ld1 { v1.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[3], [x11]
 ; CHECK-NEXT:    add x11, sp, #552
-; CHECK-NEXT:    ld1 { v5.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #16
-; CHECK-NEXT:    ld1 { v3.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #304
-; CHECK-NEXT:    ld1 { v0.b }[7], [x12]
-; CHECK-NEXT:    mov v1.b[7], w7
+; CHECK-NEXT:    add x9, sp, #192
+; CHECK-NEXT:    add x15, sp, #104
 ; CHECK-NEXT:    ld1 { v4.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #112
-; CHECK-NEXT:    ld1 { v2.b }[4], [x9]
+; CHECK-NEXT:    add x11, sp, #360
+; CHECK-NEXT:    add x12, sp, #200
+; CHECK-NEXT:    ld1 { v5.b }[3], [x11]
+; CHECK-NEXT:    ld1 { v1.b }[6], [x9]
 ; CHECK-NEXT:    add x9, sp, #560
-; CHECK-NEXT:    ld1 { v0.b }[8], [x10]
-; CHECK-NEXT:    add x10, sp, #216
-; CHECK-NEXT:    ld1 { v1.b }[8], [x8]
-; CHECK-NEXT:    add x8, sp, #24
-; CHECK-NEXT:    ld1 { v3.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #368
+; CHECK-NEXT:    mov v0.b[5], w5
+; CHECK-NEXT:    ld1 { v3.b }[3], [x15]
+; CHECK-NEXT:    add x15, sp, #368
 ; CHECK-NEXT:    ld1 { v4.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #312
-; CHECK-NEXT:    ld1 { v0.b }[9], [x10]
-; CHECK-NEXT:    add x10, sp, #224
+; CHECK-NEXT:    add x13, sp, #208
+; CHECK-NEXT:    add x8, sp, #216
+; CHECK-NEXT:    ld1 { v5.b }[4], [x15]
+; CHECK-NEXT:    ld1 { v1.b }[7], [x12]
+; CHECK-NEXT:    add x12, sp, #568
+; CHECK-NEXT:    add x14, sp, #224
+; CHECK-NEXT:    add x16, sp, #304
+; CHECK-NEXT:    add x10, sp, #232
+; CHECK-NEXT:    mov v0.b[6], w6
+; CHECK-NEXT:    ld1 { v4.b }[5], [x12]
+; CHECK-NEXT:    add x12, sp, #376
+; CHECK-NEXT:    ld1 { v5.b }[5], [x12]
+; CHECK-NEXT:    add x12, sp, #16
+; CHECK-NEXT:    ld1 { v1.b }[8], [x13]
+; CHECK-NEXT:    add x13, sp, #576
+; CHECK-NEXT:    ld1 { v2.b }[4], [x16]
+; CHECK-NEXT:    add x11, sp, #240
+; CHECK-NEXT:    ld1 { v4.b }[6], [x13]
+; CHECK-NEXT:    add x13, sp, #384
+; CHECK-NEXT:    add x9, sp, #248
+; CHECK-NEXT:    mov v0.b[7], w7
 ; CHECK-NEXT:    ld1 { v1.b }[9], [x8]
-; CHECK-NEXT:    add x8, sp, #32
-; CHECK-NEXT:    ld1 { v5.b }[4], [x11]
+; CHECK-NEXT:    ld1 { v5.b }[6], [x13]
+; CHECK-NEXT:    add x13, sp, #112
+; CHECK-NEXT:    add x8, sp, #584
+; CHECK-NEXT:    add x15, sp, #256
+; CHECK-NEXT:    ld1 { v3.b }[4], [x13]
+; CHECK-NEXT:    add x13, sp, #32
+; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
+; CHECK-NEXT:    ld1 { v1.b }[10], [x14]
+; CHECK-NEXT:    add x14, sp, #312
+; CHECK-NEXT:    add x8, sp, #40
+; CHECK-NEXT:    ld1 { v0.b }[8], [x12]
+; CHECK-NEXT:    add x12, sp, #24
+; CHECK-NEXT:    ld1 { v2.b }[5], [x14]
+; CHECK-NEXT:    add x14, sp, #592
+; CHECK-NEXT:    add x16, sp, #264
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v1.b }[11], [x10]
+; CHECK-NEXT:    ld1 { v4.b }[8], [x14]
+; CHECK-NEXT:    add x14, sp, #400
+; CHECK-NEXT:    ld1 { v0.b }[9], [x12]
+; CHECK-NEXT:    add x12, sp, #392
+; CHECK-NEXT:    add x10, sp, #72
+; CHECK-NEXT:    ld1 { v5.b }[7], [x12]
+; CHECK-NEXT:    add x12, sp, #48
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v1.b }[12], [x11]
 ; CHECK-NEXT:    add x11, sp, #120
-; CHECK-NEXT:    ld1 { v2.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #568
-; CHECK-NEXT:    ld1 { v0.b }[10], [x10]
-; CHECK-NEXT:    add x10, sp, #232
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v0.b }[10], [x13]
 ; CHECK-NEXT:    ld1 { v3.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #376
-; CHECK-NEXT:    ld1 { v1.b }[10], [x8]
-; CHECK-NEXT:    add x8, sp, #40
-; CHECK-NEXT:    ld1 { v4.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #320
-; CHECK-NEXT:    ld1 { v0.b }[11], [x10]
-; CHECK-NEXT:    add x10, sp, #240
-; CHECK-NEXT:    ld1 { v5.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #128
-; CHECK-NEXT:    ld1 { v1.b }[11], [x8]
-; CHECK-NEXT:    add x8, sp, #48
-; CHECK-NEXT:    ld1 { v2.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #576
-; CHECK-NEXT:    ld1 { v0.b }[12], [x10]
-; CHECK-NEXT:    add x10, sp, #248
-; CHECK-NEXT:    ld1 { v3.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #384
-; CHECK-NEXT:    ld1 { v1.b }[12], [x8]
-; CHECK-NEXT:    add x8, sp, #56
-; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #328
-; CHECK-NEXT:    ld1 { v5.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #584
-; CHECK-NEXT:    ld1 { v0.b }[13], [x10]
-; CHECK-NEXT:    add x10, sp, #256
-; CHECK-NEXT:    ld1 { v1.b }[13], [x8]
-; CHECK-NEXT:    add x8, sp, #64
-; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #392
-; CHECK-NEXT:    ld1 { v4.b }[7], [x11]
-; CHECK-NEXT:    add x11, sp, #592
-; CHECK-NEXT:    ld1 { v0.b }[14], [x10]
-; CHECK-NEXT:    add x10, sp, #264
-; CHECK-NEXT:    ld1 { v5.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #136
-; CHECK-NEXT:    ld1 { v1.b }[14], [x8]
-; CHECK-NEXT:    add x8, sp, #72
-; CHECK-NEXT:    ld1 { v4.b }[8], [x11]
-; CHECK-NEXT:    add x11, sp, #400
-; CHECK-NEXT:    ld1 { v0.b }[15], [x10]
-; CHECK-NEXT:    add x10, sp, #600
-; CHECK-NEXT:    ld1 { v3.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #664
-; CHECK-NEXT:    ld1 { v5.b }[8], [x11]
+; CHECK-NEXT:    add x11, sp, #408
+; CHECK-NEXT:    ld1 { v5.b }[8], [x14]
+; CHECK-NEXT:    add x13, sp, #56
+; CHECK-NEXT:    add x14, sp, #64
+; CHECK-NEXT:    ld1 { v1.b }[13], [x9]
+; CHECK-NEXT:    add x9, sp, #616
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v0.b }[11], [x8]
+; CHECK-NEXT:    add x8, sp, #600
+; CHECK-NEXT:    ld1 { v4.b }[9], [x8]
+; CHECK-NEXT:    ld1 { v5.b }[9], [x11]
 ; CHECK-NEXT:    add x11, sp, #608
-; CHECK-NEXT:    ld1 { v1.b }[15], [x8]
-; CHECK-NEXT:    add x8, sp, #408
-; CHECK-NEXT:    ld1 { v4.b }[9], [x10]
-; CHECK-NEXT:    add x10, sp, #472
-; CHECK-NEXT:    ld1 { v7.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #672
-; CHECK-NEXT:    ld1 { v5.b }[9], [x8]
-; CHECK-NEXT:    add x8, sp, #416
-; CHECK-NEXT:    ld1 { v16.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #480
+; CHECK-NEXT:    ld1 { v1.b }[14], [x15]
+; CHECK-NEXT:    add x15, sp, #488
+; CHECK-NEXT:    add x8, sp, #320
+; CHECK-NEXT:    ld1 { v0.b }[12], [x12]
+; CHECK-NEXT:    ld1 { v7.b }[3], [x15]
+; CHECK-NEXT:    ld1 { v2.b }[6], [x8]
 ; CHECK-NEXT:    ld1 { v4.b }[10], [x11]
-; CHECK-NEXT:    add x11, sp, #616
-; CHECK-NEXT:    ld1 { v7.b }[2], [x9]
+; CHECK-NEXT:    add x8, sp, #624
+; CHECK-NEXT:    add x12, sp, #328
+; CHECK-NEXT:    add x11, sp, #128
+; CHECK-NEXT:    ld1 { v1.b }[15], [x16]
+; CHECK-NEXT:    ld1 { v0.b }[13], [x13]
+; CHECK-NEXT:    add x13, sp, #416
+; CHECK-NEXT:    ld1 { v3.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v5.b }[10], [x13]
+; CHECK-NEXT:    ld1 { v4.b }[11], [x9]
 ; CHECK-NEXT:    add x9, sp, #680
-; CHECK-NEXT:    ld1 { v5.b }[10], [x8]
-; CHECK-NEXT:    add x8, sp, #424
-; CHECK-NEXT:    ld1 { v16.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #488
-; CHECK-NEXT:    ld1 { v4.b }[11], [x11]
-; CHECK-NEXT:    add x11, sp, #624
-; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v6.b }[3], [x9]
 ; CHECK-NEXT:    add x9, sp, #688
-; CHECK-NEXT:    ld1 { v5.b }[11], [x8]
+; CHECK-NEXT:    add x13, sp, #632
+; CHECK-NEXT:    ld1 { v0.b }[14], [x14]
+; CHECK-NEXT:    add x14, sp, #424
+; CHECK-NEXT:    ld1 { v2.b }[7], [x12]
+; CHECK-NEXT:    ld1 { v5.b }[11], [x14]
+; CHECK-NEXT:    ld1 { v4.b }[12], [x8]
 ; CHECK-NEXT:    add x8, sp, #432
-; CHECK-NEXT:    ld1 { v16.b }[3], [x10]
+; CHECK-NEXT:    ld1 { v6.b }[4], [x9]
+; CHECK-NEXT:    add x11, sp, #696
+; CHECK-NEXT:    add x12, sp, #504
+; CHECK-NEXT:    ld1 { v0.b }[15], [x10]
 ; CHECK-NEXT:    add x10, sp, #496
-; CHECK-NEXT:    ld1 { v4.b }[12], [x11]
-; CHECK-NEXT:    add x11, sp, #632
-; CHECK-NEXT:    ld1 { v7.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #696
+; CHECK-NEXT:    add x9, sp, #640
 ; CHECK-NEXT:    ld1 { v5.b }[12], [x8]
-; CHECK-NEXT:    add x8, sp, #440
-; CHECK-NEXT:    ld1 { v16.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #504
-; CHECK-NEXT:    ld1 { v4.b }[13], [x11]
-; CHECK-NEXT:    add x11, sp, #640
-; CHECK-NEXT:    ld1 { v7.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #704
-; CHECK-NEXT:    ld1 { v5.b }[13], [x8]
-; CHECK-NEXT:    add x8, sp, #448
-; CHECK-NEXT:    ld1 { v16.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #512
-; CHECK-NEXT:    ld1 { v4.b }[14], [x11]
-; CHECK-NEXT:    add x11, sp, #648
-; CHECK-NEXT:    ld1 { v7.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #712
-; CHECK-NEXT:    ld1 { v5.b }[14], [x8]
-; CHECK-NEXT:    add x8, sp, #456
-; CHECK-NEXT:    ld1 { v16.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #520
-; CHECK-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v4.b }[15], [x11]
-; CHECK-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v7.b }[7], [x9]
-; CHECK-NEXT:    ld1 { v5.b }[15], [x8]
-; CHECK-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v16.b }[7], [x10]
-; CHECK-NEXT:    movi v19.2d, #0000000000000000
-; CHECK-NEXT:    sdot v17.4s, v1.16b, v0.16b
-; CHECK-NEXT:    sdot v6.4s, v5.16b, v4.16b
-; CHECK-NEXT:    sdot v18.2s, v3.8b, v2.8b
-; CHECK-NEXT:    sdot v19.2s, v16.8b, v7.8b
+; CHECK-NEXT:    ld1 { v7.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v4.b }[13], [x13]
+; CHECK-NEXT:    add x10, sp, #440
+; CHECK-NEXT:    ld1 { v6.b }[5], [x11]
+; CHECK-NEXT:    add x11, sp, #512
+; CHECK-NEXT:    add x8, sp, #136
+; CHECK-NEXT:    sdot v17.4s, v0.16b, v1.16b
+; CHECK-NEXT:    ld1 { v5.b }[13], [x10]
+; CHECK-NEXT:    ld1 { v7.b }[5], [x12]
+; CHECK-NEXT:    ld1 { v4.b }[14], [x9]
+; CHECK-NEXT:    add x9, sp, #448
+; CHECK-NEXT:    add x10, sp, #704
+; CHECK-NEXT:    ld1 { v3.b }[7], [x8]
+; CHECK-NEXT:    ld1 { v6.b }[6], [x10]
+; CHECK-NEXT:    add x8, sp, #648
+; CHECK-NEXT:    add x10, sp, #520
+; CHECK-NEXT:    ld1 { v5.b }[14], [x9]
+; CHECK-NEXT:    ld1 { v7.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v4.b }[15], [x8]
+; CHECK-NEXT:    add x8, sp, #456
+; CHECK-NEXT:    add x9, sp, #712
+; CHECK-NEXT:    sdot v19.2s, v3.8b, v2.8b
+; CHECK-NEXT:    ld1 { v6.b }[7], [x9]
 ; CHECK-NEXT:    addv s0, v17.4s
-; CHECK-NEXT:    addv s2, v6.4s
-; CHECK-NEXT:    addp v1.2s, v18.2s, v18.2s
-; CHECK-NEXT:    addp v3.2s, v19.2s, v19.2s
+; CHECK-NEXT:    ld1 { v5.b }[15], [x8]
+; CHECK-NEXT:    ld1 { v7.b }[7], [x10]
+; CHECK-NEXT:    addp v1.2s, v19.2s, v19.2s
+; CHECK-NEXT:    sdot v16.4s, v5.16b, v4.16b
+; CHECK-NEXT:    sdot v18.2s, v7.8b, v6.8b
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    fmov w11, s3
+; CHECK-NEXT:    addv s2, v16.4s
+; CHECK-NEXT:    addp v3.2s, v18.2s, v18.2s
 ; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w11, s3
 ; CHECK-NEXT:    add w9, w10, w11
 ; CHECK-NEXT:    add w0, w8, w9
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -880,107 +880,107 @@ define i32 @test_sdot_v24i8_double_nomla(<24 x i8> %a, <24 x i8> %b, <24 x i8> %
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    ldr b1, [sp, #336]
 ; CHECK-NEXT:    add x8, sp, #344
-; CHECK-NEXT:    add x9, sp, #392
-; CHECK-NEXT:    add x10, sp, #24
+; CHECK-NEXT:    add x9, sp, #400
 ; CHECK-NEXT:    ldr b2, [sp, #80]
-; CHECK-NEXT:    mov v0.b[1], w1
-; CHECK-NEXT:    add x13, sp, #88
+; CHECK-NEXT:    ldr b3, [sp, #464]
 ; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
 ; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr b3, [sp, #464]
-; CHECK-NEXT:    add x14, sp, #472
-; CHECK-NEXT:    ld1 { v2.b }[1], [x13]
-; CHECK-NEXT:    add x11, sp, #32
-; CHECK-NEXT:    mov v0.b[2], w2
-; CHECK-NEXT:    add x13, sp, #96
+; CHECK-NEXT:    add x10, sp, #408
+; CHECK-NEXT:    mov v0.b[1], w1
+; CHECK-NEXT:    add x11, sp, #472
+; CHECK-NEXT:    add x12, sp, #480
+; CHECK-NEXT:    ld1 { v3.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #416
+; CHECK-NEXT:    add x13, sp, #488
 ; CHECK-NEXT:    ld1 { v1.b }[2], [x8]
 ; CHECK-NEXT:    add x8, sp, #360
-; CHECK-NEXT:    ld1 { v3.b }[1], [x14]
-; CHECK-NEXT:    add x14, sp, #480
-; CHECK-NEXT:    ld1 { v2.b }[2], [x13]
-; CHECK-NEXT:    add x12, sp, #40
-; CHECK-NEXT:    mov v0.b[3], w3
-; CHECK-NEXT:    add x13, sp, #104
+; CHECK-NEXT:    add x14, sp, #496
+; CHECK-NEXT:    movi v4.16b, #1
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    mov v0.b[2], w2
+; CHECK-NEXT:    ld1 { v3.b }[2], [x12]
+; CHECK-NEXT:    add x12, sp, #424
 ; CHECK-NEXT:    ld1 { v1.b }[3], [x8]
 ; CHECK-NEXT:    add x8, sp, #368
-; CHECK-NEXT:    ld1 { v3.b }[2], [x14]
-; CHECK-NEXT:    add x14, sp, #488
-; CHECK-NEXT:    ld1 { v2.b }[3], [x13]
-; CHECK-NEXT:    add x13, sp, #112
-; CHECK-NEXT:    mov v0.b[4], w4
-; CHECK-NEXT:    movi v6.8b, #1
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    movi v16.8b, #1
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v3.b }[3], [x13]
+; CHECK-NEXT:    add x13, sp, #432
+; CHECK-NEXT:    mov v0.b[3], w3
 ; CHECK-NEXT:    ld1 { v1.b }[4], [x8]
 ; CHECK-NEXT:    add x8, sp, #376
-; CHECK-NEXT:    ld1 { v3.b }[3], [x14]
-; CHECK-NEXT:    add x14, sp, #496
-; CHECK-NEXT:    ld1 { v2.b }[4], [x13]
-; CHECK-NEXT:    add x13, sp, #120
-; CHECK-NEXT:    mov v0.b[5], w5
+; CHECK-NEXT:    ld1 { v3.b }[4], [x14]
 ; CHECK-NEXT:    ld1 { v1.b }[5], [x8]
 ; CHECK-NEXT:    add x8, sp, #384
-; CHECK-NEXT:    ld1 { v3.b }[4], [x14]
-; CHECK-NEXT:    add x14, sp, #504
-; CHECK-NEXT:    ld1 { v2.b }[5], [x13]
-; CHECK-NEXT:    add x13, sp, #512
-; CHECK-NEXT:    mov v0.b[6], w6
+; CHECK-NEXT:    mov v0.b[4], w4
 ; CHECK-NEXT:    ld1 { v1.b }[6], [x8]
+; CHECK-NEXT:    add x8, sp, #392
+; CHECK-NEXT:    mov v0.b[5], w5
+; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
 ; CHECK-NEXT:    add x8, sp, #16
-; CHECK-NEXT:    ld1 { v3.b }[5], [x14]
-; CHECK-NEXT:    movi v4.16b, #1
+; CHECK-NEXT:    mov v0.b[6], w6
+; CHECK-NEXT:    ld1 { v1.b }[8], [x9]
+; CHECK-NEXT:    add x9, sp, #88
+; CHECK-NEXT:    ld1 { v2.b }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #40
+; CHECK-NEXT:    ld1 { v1.b }[9], [x10]
+; CHECK-NEXT:    add x10, sp, #96
 ; CHECK-NEXT:    mov v0.b[7], w7
-; CHECK-NEXT:    ld1 { v1.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #400
-; CHECK-NEXT:    ld1 { v3.b }[6], [x13]
-; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v2.b }[2], [x10]
+; CHECK-NEXT:    add x10, sp, #56
+; CHECK-NEXT:    ld1 { v1.b }[10], [x11]
+; CHECK-NEXT:    add x11, sp, #104
+; CHECK-NEXT:    ld1 { v2.b }[3], [x11]
+; CHECK-NEXT:    add x11, sp, #72
 ; CHECK-NEXT:    ld1 { v0.b }[8], [x8]
+; CHECK-NEXT:    add x8, sp, #24
+; CHECK-NEXT:    ld1 { v1.b }[11], [x12]
+; CHECK-NEXT:    add x12, sp, #112
+; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
+; CHECK-NEXT:    add x12, sp, #440
+; CHECK-NEXT:    ld1 { v0.b }[9], [x8]
+; CHECK-NEXT:    add x8, sp, #32
+; CHECK-NEXT:    ld1 { v1.b }[12], [x13]
+; CHECK-NEXT:    add x13, sp, #504
+; CHECK-NEXT:    ld1 { v3.b }[5], [x13]
+; CHECK-NEXT:    add x13, sp, #512
+; CHECK-NEXT:    ld1 { v0.b }[10], [x8]
 ; CHECK-NEXT:    add x8, sp, #48
-; CHECK-NEXT:    ld1 { v1.b }[8], [x9]
-; CHECK-NEXT:    add x9, sp, #408
-; CHECK-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v0.b }[9], [x10]
-; CHECK-NEXT:    add x10, sp, #56
-; CHECK-NEXT:    ld1 { v1.b }[9], [x9]
-; CHECK-NEXT:    add x9, sp, #416
-; CHECK-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v0.b }[10], [x11]
-; CHECK-NEXT:    add x11, sp, #64
-; CHECK-NEXT:    ld1 { v1.b }[10], [x9]
-; CHECK-NEXT:    add x9, sp, #424
-; CHECK-NEXT:    ld1 { v0.b }[11], [x12]
-; CHECK-NEXT:    add x12, sp, #72
-; CHECK-NEXT:    ld1 { v1.b }[11], [x9]
-; CHECK-NEXT:    add x9, sp, #432
+; CHECK-NEXT:    ld1 { v1.b }[13], [x12]
+; CHECK-NEXT:    add x12, sp, #448
+; CHECK-NEXT:    ld1 { v3.b }[6], [x13]
+; CHECK-NEXT:    ld1 { v0.b }[11], [x9]
+; CHECK-NEXT:    add x9, sp, #64
+; CHECK-NEXT:    ld1 { v1.b }[14], [x12]
 ; CHECK-NEXT:    ld1 { v0.b }[12], [x8]
-; CHECK-NEXT:    add x8, sp, #440
-; CHECK-NEXT:    ld1 { v1.b }[12], [x9]
-; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    add x8, sp, #120
+; CHECK-NEXT:    ld1 { v2.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #128
 ; CHECK-NEXT:    ld1 { v0.b }[13], [x10]
 ; CHECK-NEXT:    add x10, sp, #136
-; CHECK-NEXT:    ld1 { v1.b }[13], [x8]
-; CHECK-NEXT:    add x8, sp, #448
-; CHECK-NEXT:    ld1 { v2.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #520
-; CHECK-NEXT:    ld1 { v0.b }[14], [x11]
-; CHECK-NEXT:    ld1 { v1.b }[14], [x8]
+; CHECK-NEXT:    ld1 { v2.b }[6], [x8]
 ; CHECK-NEXT:    add x8, sp, #456
+; CHECK-NEXT:    ld1 { v1.b }[15], [x8]
+; CHECK-NEXT:    ld1 { v0.b }[14], [x9]
+; CHECK-NEXT:    add x9, sp, #520
 ; CHECK-NEXT:    ld1 { v2.b }[7], [x10]
 ; CHECK-NEXT:    ld1 { v3.b }[7], [x9]
-; CHECK-NEXT:    ld1 { v0.b }[15], [x12]
-; CHECK-NEXT:    ld1 { v1.b }[15], [x8]
-; CHECK-NEXT:    sdot v7.2s, v2.8b, v6.8b
-; CHECK-NEXT:    sdot v5.2s, v3.8b, v6.8b
-; CHECK-NEXT:    sdot v16.4s, v0.16b, v4.16b
-; CHECK-NEXT:    sdot v17.4s, v1.16b, v4.16b
-; CHECK-NEXT:    addp v0.2s, v7.2s, v7.2s
-; CHECK-NEXT:    addp v1.2s, v5.2s, v5.2s
-; CHECK-NEXT:    addv s2, v16.4s
-; CHECK-NEXT:    addv s3, v17.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    fmov w11, s1
+; CHECK-NEXT:    sdot v5.4s, v1.16b, v4.16b
+; CHECK-NEXT:    ld1 { v0.b }[15], [x11]
+; CHECK-NEXT:    sdot v17.2s, v2.8b, v16.8b
+; CHECK-NEXT:    sdot v7.2s, v3.8b, v16.8b
+; CHECK-NEXT:    sdot v6.4s, v0.16b, v4.16b
+; CHECK-NEXT:    addv s3, v5.4s
+; CHECK-NEXT:    addp v1.2s, v17.2s, v17.2s
+; CHECK-NEXT:    addp v2.2s, v7.2s, v7.2s
+; CHECK-NEXT:    addv s0, v6.4s
 ; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    fmov w11, s2
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    add w8, w8, w9
 ; CHECK-NEXT:    add w9, w10, w11
 ; CHECK-NEXT:    add w0, w8, w9
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -998,26 +998,26 @@ entry:
 define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v25i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q1, q4, [x1]
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    ushll2 v7.8h, v4.16b, #0
-; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-NEXT:    ushll2 v6.8h, v3.16b, #0
-; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-NEXT:    ushll v5.8h, v2.8b, #0
-; CHECK-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NEXT:    umull v2.4s, v7.4h, v2.4h
-; CHECK-NEXT:    ushll v7.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT:    umull2 v16.4s, v7.8h, v3.8h
-; CHECK-NEXT:    mov v0.s[0], v2.s[0]
-; CHECK-NEXT:    umull v2.4s, v7.4h, v3.4h
-; CHECK-NEXT:    umlal2 v16.4s, v4.8h, v5.8h
-; CHECK-NEXT:    umlal v0.4s, v1.4h, v6.4h
-; CHECK-NEXT:    umlal v2.4s, v4.4h, v5.4h
-; CHECK-NEXT:    umlal2 v16.4s, v1.8h, v6.8h
-; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ldp q4, q0, [x0]
+; CHECK-NEXT:    ldp q5, q1, [x1]
+; CHECK-NEXT:    ushll2 v2.8h, v0.16b, #0
+; CHECK-NEXT:    ushll v6.8h, v4.8b, #0
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-NEXT:    ushll v7.8h, v5.8b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    umull v2.4s, v3.4h, v2.4h
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    umull2 v16.4s, v7.8h, v6.8h
+; CHECK-NEXT:    umull v6.4s, v7.4h, v6.4h
+; CHECK-NEXT:    mov v3.s[0], v2.s[0]
+; CHECK-NEXT:    ushll2 v2.8h, v4.16b, #0
+; CHECK-NEXT:    ushll2 v4.8h, v5.16b, #0
+; CHECK-NEXT:    umlal v6.4s, v1.4h, v0.4h
+; CHECK-NEXT:    umlal2 v16.4s, v1.8h, v0.8h
+; CHECK-NEXT:    umlal v3.4s, v4.4h, v2.4h
+; CHECK-NEXT:    umlal2 v16.4s, v4.8h, v2.8h
+; CHECK-NEXT:    add v0.4s, v6.4s, v3.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v16.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
@@ -1037,19 +1037,19 @@ entry:
 define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_udot_v25i8_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    ldp q2, q1, [x0]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v4.8h, v2.16b, #0
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    ushll v4.4s, v4.4h, #0
-; CHECK-NEXT:    uaddl2 v5.4s, v1.8h, v2.8h
-; CHECK-NEXT:    mov v0.s[0], v4.s[0]
-; CHECK-NEXT:    uaddl v1.4s, v1.4h, v2.4h
-; CHECK-NEXT:    uaddw2 v2.4s, v5.4s, v3.8h
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v3.4h
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NEXT:    uaddl2 v5.4s, v4.8h, v3.8h
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    uaddl v3.4s, v4.4h, v3.4h
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    uaddw2 v1.4s, v5.4s, v2.8h
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -1063,26 +1063,26 @@ entry:
 define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_sdot_v25i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q1, q4, [x1]
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    sshll2 v7.8h, v4.16b, #0
-; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-NEXT:    sshll2 v6.8h, v3.16b, #0
-; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-NEXT:    sshll v5.8h, v2.8b, #0
-; CHECK-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NEXT:    smull v2.4s, v7.4h, v2.4h
-; CHECK-NEXT:    sshll v7.8h, v1.8b, #0
-; CHECK-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-NEXT:    smull2 v16.4s, v7.8h, v3.8h
-; CHECK-NEXT:    mov v0.s[0], v2.s[0]
-; CHECK-NEXT:    smull v2.4s, v7.4h, v3.4h
-; CHECK-NEXT:    smlal2 v16.4s, v4.8h, v5.8h
-; CHECK-NEXT:    smlal v0.4s, v1.4h, v6.4h
-; CHECK-NEXT:    smlal v2.4s, v4.4h, v5.4h
-; CHECK-NEXT:    smlal2 v16.4s, v1.8h, v6.8h
-; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ldp q4, q0, [x0]
+; CHECK-NEXT:    ldp q5, q1, [x1]
+; CHECK-NEXT:    sshll2 v2.8h, v0.16b, #0
+; CHECK-NEXT:    sshll v6.8h, v4.8b, #0
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-NEXT:    sshll v7.8h, v5.8b, #0
+; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NEXT:    smull v2.4s, v3.4h, v2.4h
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    smull2 v16.4s, v7.8h, v6.8h
+; CHECK-NEXT:    smull v6.4s, v7.4h, v6.4h
+; CHECK-NEXT:    mov v3.s[0], v2.s[0]
+; CHECK-NEXT:    sshll2 v2.8h, v4.16b, #0
+; CHECK-NEXT:    sshll2 v4.8h, v5.16b, #0
+; CHECK-NEXT:    smlal v6.4s, v1.4h, v0.4h
+; CHECK-NEXT:    smlal2 v16.4s, v1.8h, v0.8h
+; CHECK-NEXT:    smlal v3.4s, v4.4h, v2.4h
+; CHECK-NEXT:    smlal2 v16.4s, v4.8h, v2.8h
+; CHECK-NEXT:    add v0.4s, v6.4s, v3.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v16.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
@@ -1105,221 +1105,221 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldr b2, [sp, #80]
+; CHECK-NEXT:    fmov s3, w0
+; CHECK-NEXT:    ldr b0, [sp, #80]
 ; CHECK-NEXT:    add x8, sp, #88
-; CHECK-NEXT:    ldr b0, [sp, #16]
-; CHECK-NEXT:    add x9, sp, #24
-; CHECK-NEXT:    add x10, sp, #40
-; CHECK-NEXT:    add x11, sp, #128
-; CHECK-NEXT:    ld1 { v2.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #96
-; CHECK-NEXT:    ld1 { v0.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #32
-; CHECK-NEXT:    ldr b17, [sp, #152]
-; CHECK-NEXT:    fmov s4, w0
-; CHECK-NEXT:    ldr b6, [sp, #280]
-; CHECK-NEXT:    add x12, sp, #224
-; CHECK-NEXT:    ld1 { v2.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #104
-; CHECK-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #112
-; CHECK-NEXT:    ldr b1, [sp, #216]
-; CHECK-NEXT:    mov v4.b[1], w1
-; CHECK-NEXT:    ldr b3, [sp, #480]
-; CHECK-NEXT:    ld1 { v2.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #120
-; CHECK-NEXT:    ld1 { v0.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #48
-; CHECK-NEXT:    ld1 { v1.b }[1], [x12]
-; CHECK-NEXT:    mov v4.b[2], w2
-; CHECK-NEXT:    ldr b18, [sp, #352]
-; CHECK-NEXT:    ld1 { v2.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #56
-; CHECK-NEXT:    ld1 { v0.b }[4], [x10]
+; CHECK-NEXT:    ldr b1, [sp, #16]
+; CHECK-NEXT:    add x10, sp, #24
+; CHECK-NEXT:    ldr b2, [sp, #280]
+; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
+; CHECK-NEXT:    ldr b5, [sp, #152]
+; CHECK-NEXT:    add x9, sp, #96
+; CHECK-NEXT:    mov v3.b[1], w1
+; CHECK-NEXT:    ld1 { v1.b }[1], [x10]
 ; CHECK-NEXT:    add x10, sp, #288
-; CHECK-NEXT:    ldr b20, [sp, #680]
-; CHECK-NEXT:    mov v4.b[3], w3
-; CHECK-NEXT:    ldr b5, [sp, #144]
-; CHECK-NEXT:    ld1 { v2.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #160
-; CHECK-NEXT:    ld1 { v0.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #64
-; CHECK-NEXT:    ld1 { v6.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #296
-; CHECK-NEXT:    ld1 { v17.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #136
-; CHECK-NEXT:    ld1 { v2.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #320
-; CHECK-NEXT:    ld1 { v0.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #72
-; CHECK-NEXT:    ld1 { v6.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #304
-; CHECK-NEXT:    mov v4.b[4], w4
-; CHECK-NEXT:    ld1 { v2.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #168
-; CHECK-NEXT:    ld1 { v0.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #232
-; CHECK-NEXT:    ld1 { v6.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #312
-; CHECK-NEXT:    ld1 { v17.b }[2], [x8]
+; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #160
+; CHECK-NEXT:    ldr b4, [sp, #216]
+; CHECK-NEXT:    ld1 { v0.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v5.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #32
+; CHECK-NEXT:    add x11, sp, #224
+; CHECK-NEXT:    ld1 { v1.b }[2], [x10]
+; CHECK-NEXT:    add x8, sp, #104
+; CHECK-NEXT:    mov v3.b[2], w2
+; CHECK-NEXT:    ld1 { v4.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #296
+; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
+; CHECK-NEXT:    add x8, sp, #40
+; CHECK-NEXT:    add x10, sp, #232
+; CHECK-NEXT:    ld1 { v1.b }[3], [x8]
+; CHECK-NEXT:    ld1 { v2.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #168
+; CHECK-NEXT:    ld1 { v4.b }[2], [x10]
+; CHECK-NEXT:    ld1 { v5.b }[2], [x11]
+; CHECK-NEXT:    add x13, sp, #48
+; CHECK-NEXT:    mov v3.b[3], w3
+; CHECK-NEXT:    add x8, sp, #240
+; CHECK-NEXT:    add x15, sp, #56
+; CHECK-NEXT:    ld1 { v1.b }[4], [x13]
+; CHECK-NEXT:    add x12, sp, #112
+; CHECK-NEXT:    add x11, sp, #304
+; CHECK-NEXT:    ld1 { v4.b }[3], [x8]
 ; CHECK-NEXT:    add x8, sp, #176
-; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #488
-; CHECK-NEXT:    mov v4.b[5], w5
-; CHECK-NEXT:    ld1 { v6.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #240
-; CHECK-NEXT:    ld1 { v3.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #496
-; CHECK-NEXT:    ld1 { v17.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #184
-; CHECK-NEXT:    ld1 { v1.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #248
-; CHECK-NEXT:    mov v4.b[6], w6
-; CHECK-NEXT:    ld1 { v6.b }[5], [x11]
-; CHECK-NEXT:    ld1 { v3.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #504
-; CHECK-NEXT:    ld1 { v17.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #192
-; CHECK-NEXT:    ld1 { v1.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #256
-; CHECK-NEXT:    add x11, sp, #328
-; CHECK-NEXT:    ld1 { v3.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #512
-; CHECK-NEXT:    ld1 { v17.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #200
-; CHECK-NEXT:    ld1 { v1.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #264
-; CHECK-NEXT:    mov v4.b[7], w7
-; CHECK-NEXT:    ld1 { v6.b }[6], [x11]
-; CHECK-NEXT:    ld1 { v3.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #520
-; CHECK-NEXT:    ld1 { v17.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #208
-; CHECK-NEXT:    ld1 { v1.b }[6], [x10]
-; CHECK-NEXT:    add x11, sp, #336
-; CHECK-NEXT:    add x10, sp, #272
-; CHECK-NEXT:    ld1 { v3.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #536
-; CHECK-NEXT:    ld1 { v17.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #528
-; CHECK-NEXT:    sshll v19.8h, v4.8b, #0
-; CHECK-NEXT:    ldr b4, [sp, #416]
-; CHECK-NEXT:    ld1 { v6.b }[7], [x11]
-; CHECK-NEXT:    add x11, sp, #688
-; CHECK-NEXT:    ld1 { v3.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #424
-; CHECK-NEXT:    ld1 { v1.b }[7], [x10]
-; CHECK-NEXT:    add x10, sp, #360
-; CHECK-NEXT:    sshll v7.8h, v2.8b, #0
-; CHECK-NEXT:    ldr b2, [sp, #344]
-; CHECK-NEXT:    ld1 { v4.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #432
-; CHECK-NEXT:    sshll v17.8h, v17.8b, #0
-; CHECK-NEXT:    ld1 { v18.b }[1], [x10]
-; CHECK-NEXT:    sshll v16.8h, v6.8b, #0
-; CHECK-NEXT:    ld1 { v3.b }[7], [x9]
-; CHECK-NEXT:    sshll v6.8h, v2.8b, #0
-; CHECK-NEXT:    add x9, sp, #560
-; CHECK-NEXT:    smull v2.4s, v19.4h, v17.4h
-; CHECK-NEXT:    ld1 { v4.b }[2], [x8]
-; CHECK-NEXT:    smull2 v17.4s, v19.8h, v17.8h
-; CHECK-NEXT:    ldr b19, [sp, #552]
+; CHECK-NEXT:    ld1 { v0.b }[4], [x12]
+; CHECK-NEXT:    ld1 { v5.b }[3], [x8]
+; CHECK-NEXT:    add x12, sp, #184
+; CHECK-NEXT:    ld1 { v2.b }[3], [x11]
+; CHECK-NEXT:    mov v3.b[4], w4
+; CHECK-NEXT:    ld1 { v1.b }[5], [x15]
+; CHECK-NEXT:    add x11, sp, #64
+; CHECK-NEXT:    add x9, sp, #120
+; CHECK-NEXT:    add x8, sp, #312
+; CHECK-NEXT:    ldr b6, [sp, #352]
+; CHECK-NEXT:    ld1 { v5.b }[4], [x12]
+; CHECK-NEXT:    ld1 { v0.b }[5], [x9]
+; CHECK-NEXT:    add x9, sp, #192
+; CHECK-NEXT:    ld1 { v1.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v2.b }[4], [x8]
+; CHECK-NEXT:    add x8, sp, #72
+; CHECK-NEXT:    mov v3.b[5], w5
+; CHECK-NEXT:    add x11, sp, #360
+; CHECK-NEXT:    ldr b16, [sp, #552]
+; CHECK-NEXT:    ld1 { v5.b }[5], [x9]
+; CHECK-NEXT:    ld1 { v6.b }[1], [x11]
+; CHECK-NEXT:    add x15, sp, #200
+; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #560
+; CHECK-NEXT:    add x14, sp, #128
+; CHECK-NEXT:    ld1 { v16.b }[1], [x8]
 ; CHECK-NEXT:    add x8, sp, #368
-; CHECK-NEXT:    add x10, sp, #440
-; CHECK-NEXT:    ld1 { v20.b }[1], [x11]
-; CHECK-NEXT:    add x11, sp, #696
-; CHECK-NEXT:    ld1 { v19.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #376
-; CHECK-NEXT:    ld1 { v18.b }[2], [x8]
+; CHECK-NEXT:    ld1 { v0.b }[6], [x14]
+; CHECK-NEXT:    mov v3.b[6], w6
+; CHECK-NEXT:    ld1 { v5.b }[6], [x15]
+; CHECK-NEXT:    ld1 { v6.b }[2], [x8]
 ; CHECK-NEXT:    add x8, sp, #568
-; CHECK-NEXT:    ld1 { v4.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #448
-; CHECK-NEXT:    ld1 { v20.b }[2], [x11]
-; CHECK-NEXT:    add x11, sp, #704
-; CHECK-NEXT:    ld1 { v19.b }[2], [x8]
+; CHECK-NEXT:    add x14, sp, #208
+; CHECK-NEXT:    ldr b18, [sp, #480]
+; CHECK-NEXT:    ld1 { v16.b }[2], [x8]
+; CHECK-NEXT:    ldr b7, [sp, #144]
+; CHECK-NEXT:    add x11, sp, #488
+; CHECK-NEXT:    ld1 { v5.b }[7], [x14]
+; CHECK-NEXT:    add x8, sp, #376
+; CHECK-NEXT:    ld1 { v18.b }[1], [x11]
+; CHECK-NEXT:    mov v3.b[7], w7
+; CHECK-NEXT:    ld1 { v6.b }[3], [x8]
 ; CHECK-NEXT:    add x8, sp, #576
-; CHECK-NEXT:    ld1 { v18.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #384
-; CHECK-NEXT:    smlal v2.4s, v7.4h, v16.4h
-; CHECK-NEXT:    ld1 { v4.b }[4], [x10]
-; CHECK-NEXT:    smlal2 v17.4s, v7.8h, v16.8h
-; CHECK-NEXT:    ldr b7, [sp, #616]
-; CHECK-NEXT:    ld1 { v19.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #584
-; CHECK-NEXT:    ld1 { v18.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #392
-; CHECK-NEXT:    add x10, sp, #456
-; CHECK-NEXT:    ld1 { v20.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #400
-; CHECK-NEXT:    ld1 { v19.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #592
-; CHECK-NEXT:    ld1 { v18.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #624
-; CHECK-NEXT:    ld1 { v4.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #712
+; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
+; CHECK-NEXT:    ld1 { v16.b }[3], [x8]
+; CHECK-NEXT:    add x11, sp, #496
 ; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-NEXT:    ld1 { v7.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #632
-; CHECK-NEXT:    ld1 { v19.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #384
+; CHECK-NEXT:    ld1 { v18.b }[2], [x11]
+; CHECK-NEXT:    ld1 { v6.b }[4], [x8]
+; CHECK-NEXT:    add x8, sp, #584
+; CHECK-NEXT:    add x11, sp, #504
+; CHECK-NEXT:    sshll v17.8h, v3.8b, #0
+; CHECK-NEXT:    ldr b3, [sp, #344]
+; CHECK-NEXT:    ld1 { v16.b }[4], [x8]
+; CHECK-NEXT:    add x8, sp, #424
+; CHECK-NEXT:    add x16, sp, #320
+; CHECK-NEXT:    ld1 { v18.b }[3], [x11]
+; CHECK-NEXT:    sshll v19.8h, v3.8b, #0
+; CHECK-NEXT:    add x11, sp, #392
+; CHECK-NEXT:    ld1 { v2.b }[5], [x16]
+; CHECK-NEXT:    smull2 v3.4s, v17.8h, v5.8h
+; CHECK-NEXT:    smull v5.4s, v17.4h, v5.4h
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v6.b }[5], [x11]
+; CHECK-NEXT:    add x12, sp, #248
+; CHECK-NEXT:    add x11, sp, #512
+; CHECK-NEXT:    smull v7.4s, v7.4h, v19.4h
+; CHECK-NEXT:    ldr b19, [sp, #416]
+; CHECK-NEXT:    ld1 { v4.b }[4], [x12]
+; CHECK-NEXT:    add x12, sp, #328
+; CHECK-NEXT:    ld1 { v18.b }[4], [x11]
+; CHECK-NEXT:    add x11, sp, #400
+; CHECK-NEXT:    ld1 { v19.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #592
+; CHECK-NEXT:    ldr b20, [sp, #616]
+; CHECK-NEXT:    ld1 { v16.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #432
+; CHECK-NEXT:    ld1 { v2.b }[6], [x12]
+; CHECK-NEXT:    mov v17.s[0], v7.s[0]
+; CHECK-NEXT:    ldr b7, [sp, #680]
+; CHECK-NEXT:    ld1 { v6.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v19.b }[2], [x8]
 ; CHECK-NEXT:    add x8, sp, #600
-; CHECK-NEXT:    ld1 { v20.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #720
-; CHECK-NEXT:    ld1 { v18.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #408
-; CHECK-NEXT:    ld1 { v7.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #640
-; CHECK-NEXT:    ld1 { v19.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #608
-; CHECK-NEXT:    ld1 { v20.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #728
-; CHECK-NEXT:    ld1 { v18.b }[7], [x11]
-; CHECK-NEXT:    add x11, sp, #464
-; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #664
-; CHECK-NEXT:    ld1 { v19.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #648
-; CHECK-NEXT:    ld1 { v20.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #736
-; CHECK-NEXT:    sshll v16.8h, v18.8b, #0
-; CHECK-NEXT:    ld1 { v4.b }[6], [x11]
+; CHECK-NEXT:    add x11, sp, #688
+; CHECK-NEXT:    ld1 { v16.b }[6], [x8]
+; CHECK-NEXT:    add x12, sp, #624
+; CHECK-NEXT:    ld1 { v7.b }[1], [x11]
+; CHECK-NEXT:    ld1 { v20.b }[1], [x12]
+; CHECK-NEXT:    add x8, sp, #408
+; CHECK-NEXT:    add x11, sp, #608
+; CHECK-NEXT:    add x12, sp, #440
+; CHECK-NEXT:    ld1 { v6.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #696
+; CHECK-NEXT:    ld1 { v16.b }[7], [x11]
+; CHECK-NEXT:    ld1 { v19.b }[3], [x12]
+; CHECK-NEXT:    add x11, sp, #632
+; CHECK-NEXT:    ld1 { v7.b }[2], [x8]
+; CHECK-NEXT:    ld1 { v20.b }[2], [x11]
+; CHECK-NEXT:    add x8, sp, #448
+; CHECK-NEXT:    add x11, sp, #640
+; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
+; CHECK-NEXT:    add x13, sp, #256
+; CHECK-NEXT:    ld1 { v19.b }[4], [x8]
+; CHECK-NEXT:    add x8, sp, #704
+; CHECK-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-NEXT:    ld1 { v7.b }[3], [x8]
+; CHECK-NEXT:    ld1 { v20.b }[3], [x11]
+; CHECK-NEXT:    add x8, sp, #712
+; CHECK-NEXT:    add x11, sp, #648
+; CHECK-NEXT:    add x12, sp, #520
+; CHECK-NEXT:    ld1 { v4.b }[5], [x13]
+; CHECK-NEXT:    ldr b21, [sp, #544]
+; CHECK-NEXT:    smull2 v22.4s, v6.8h, v16.8h
+; CHECK-NEXT:    smull v6.4s, v6.4h, v16.4h
 ; CHECK-NEXT:    ld1 { v7.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #656
-; CHECK-NEXT:    sshll v18.8h, v19.8b, #0
-; CHECK-NEXT:    ld1 { v20.b }[7], [x10]
-; CHECK-NEXT:    smull v19.4s, v16.4h, v18.4h
-; CHECK-NEXT:    ld1 { v7.b }[5], [x8]
+; CHECK-NEXT:    ld1 { v20.b }[4], [x11]
+; CHECK-NEXT:    add x11, sp, #456
+; CHECK-NEXT:    ldr b16, [sp, #744]
+; CHECK-NEXT:    ld1 { v18.b }[5], [x12]
+; CHECK-NEXT:    ld1 { v19.b }[5], [x11]
+; CHECK-NEXT:    add x11, sp, #720
+; CHECK-NEXT:    add x12, sp, #656
+; CHECK-NEXT:    add x9, sp, #264
+; CHECK-NEXT:    ld1 { v7.b }[5], [x11]
+; CHECK-NEXT:    ld1 { v20.b }[5], [x12]
+; CHECK-NEXT:    sshll v21.8h, v21.8b, #0
+; CHECK-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-NEXT:    add x8, sp, #528
+; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
+; CHECK-NEXT:    ld1 { v18.b }[6], [x8]
+; CHECK-NEXT:    add x11, sp, #464
+; CHECK-NEXT:    add x12, sp, #728
+; CHECK-NEXT:    add x13, sp, #664
+; CHECK-NEXT:    add x10, sp, #136
+; CHECK-NEXT:    ld1 { v19.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v7.b }[6], [x12]
+; CHECK-NEXT:    ld1 { v20.b }[6], [x13]
+; CHECK-NEXT:    ld1 { v0.b }[7], [x10]
+; CHECK-NEXT:    add x10, sp, #336
+; CHECK-NEXT:    add x9, sp, #272
+; CHECK-NEXT:    smull v16.4s, v21.4h, v16.4h
+; CHECK-NEXT:    movi v21.2d, #0000000000000000
+; CHECK-NEXT:    add x8, sp, #536
+; CHECK-NEXT:    ld1 { v2.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v4.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v18.b }[7], [x8]
 ; CHECK-NEXT:    add x8, sp, #472
-; CHECK-NEXT:    smull2 v16.4s, v16.8h, v18.8h
-; CHECK-NEXT:    ldr b18, [sp, #544]
-; CHECK-NEXT:    smull v5.4s, v5.4h, v6.4h
-; CHECK-NEXT:    ldr b6, [sp, #744]
-; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
-; CHECK-NEXT:    sshll v20.8h, v20.8b, #0
-; CHECK-NEXT:    ld1 { v7.b }[6], [x9]
-; CHECK-NEXT:    sshll v18.8h, v18.8b, #0
-; CHECK-NEXT:    add x9, sp, #672
-; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
-; CHECK-NEXT:    smlal v19.4s, v3.4h, v20.4h
-; CHECK-NEXT:    smlal2 v16.4s, v3.8h, v20.8h
+; CHECK-NEXT:    add x9, sp, #736
+; CHECK-NEXT:    add x10, sp, #672
+; CHECK-NEXT:    ld1 { v19.b }[7], [x8]
 ; CHECK-NEXT:    ld1 { v7.b }[7], [x9]
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    smull v6.4s, v18.4h, v6.4h
-; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v20.b }[7], [x10]
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    mov v3.s[0], v5.s[0]
+; CHECK-NEXT:    mov v21.s[0], v16.s[0]
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    mov v18.s[0], v6.s[0]
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-NEXT:    sshll v5.8h, v7.8b, #0
-; CHECK-NEXT:    smlal v3.4s, v0.4h, v1.4h
-; CHECK-NEXT:    smlal v18.4s, v4.4h, v5.4h
-; CHECK-NEXT:    smlal2 v17.4s, v0.8h, v1.8h
-; CHECK-NEXT:    smlal2 v16.4s, v4.8h, v5.8h
-; CHECK-NEXT:    add v0.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v1.4s, v19.4s, v18.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v17.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v16.4s
+; CHECK-NEXT:    sshll v16.8h, v18.8b, #0
+; CHECK-NEXT:    sshll v18.8h, v19.8b, #0
+; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
+; CHECK-NEXT:    sshll v19.8h, v20.8b, #0
+; CHECK-NEXT:    smlal v5.4s, v0.4h, v2.4h
+; CHECK-NEXT:    smlal2 v3.4s, v0.8h, v2.8h
+; CHECK-NEXT:    smlal v17.4s, v1.4h, v4.4h
+; CHECK-NEXT:    smlal v6.4s, v16.4h, v7.4h
+; CHECK-NEXT:    smlal2 v22.4s, v16.8h, v7.8h
+; CHECK-NEXT:    smlal v21.4s, v18.4h, v19.4h
+; CHECK-NEXT:    smlal2 v3.4s, v1.8h, v4.8h
+; CHECK-NEXT:    add v0.4s, v5.4s, v17.4s
+; CHECK-NEXT:    add v1.4s, v6.4s, v21.4s
+; CHECK-NEXT:    smlal2 v22.4s, v18.8h, v19.8h
+; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v22.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -1344,116 +1344,116 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldr b0, [sp, #80]
-; CHECK-NEXT:    add x8, sp, #88
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ldr b1, [sp, #80]
+; CHECK-NEXT:    add x10, sp, #88
 ; CHECK-NEXT:    ldr b2, [sp, #16]
-; CHECK-NEXT:    add x9, sp, #24
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    add x9, sp, #96
 ; CHECK-NEXT:    ldr b3, [sp, #480]
-; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #96
-; CHECK-NEXT:    ld1 { v2.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #32
-; CHECK-NEXT:    mov v1.b[1], w1
-; CHECK-NEXT:    add x10, sp, #488
-; CHECK-NEXT:    add x11, sp, #496
+; CHECK-NEXT:    ld1 { v1.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #24
 ; CHECK-NEXT:    ldr b4, [sp, #352]
-; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
+; CHECK-NEXT:    mov v0.b[1], w1
+; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
+; CHECK-NEXT:    add x11, sp, #488
+; CHECK-NEXT:    add x10, sp, #360
+; CHECK-NEXT:    ldr b5, [sp, #416]
 ; CHECK-NEXT:    add x8, sp, #104
+; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #32
+; CHECK-NEXT:    ld1 { v3.b }[1], [x11]
 ; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #40
-; CHECK-NEXT:    ld1 { v3.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #48
-; CHECK-NEXT:    mov v1.b[2], w2
-; CHECK-NEXT:    ldr b6, [sp, #416]
-; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #112
-; CHECK-NEXT:    ld1 { v2.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #128
-; CHECK-NEXT:    ld1 { v3.b }[2], [x11]
-; CHECK-NEXT:    add x11, sp, #56
-; CHECK-NEXT:    mov v1.b[3], w3
-; CHECK-NEXT:    add x12, sp, #504
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #120
-; CHECK-NEXT:    ld1 { v2.b }[4], [x10]
+; CHECK-NEXT:    add x11, sp, #424
+; CHECK-NEXT:    ld1 { v4.b }[1], [x10]
+; CHECK-NEXT:    mov v0.b[2], w2
+; CHECK-NEXT:    ld1 { v5.b }[1], [x11]
+; CHECK-NEXT:    add x9, sp, #368
+; CHECK-NEXT:    ld1 { v1.b }[3], [x8]
+; CHECK-NEXT:    add x8, sp, #40
+; CHECK-NEXT:    add x12, sp, #496
+; CHECK-NEXT:    ld1 { v2.b }[3], [x8]
+; CHECK-NEXT:    ld1 { v4.b }[2], [x9]
+; CHECK-NEXT:    add x8, sp, #432
+; CHECK-NEXT:    ld1 { v3.b }[2], [x12]
+; CHECK-NEXT:    add x13, sp, #48
+; CHECK-NEXT:    ld1 { v5.b }[2], [x8]
+; CHECK-NEXT:    mov v0.b[3], w3
+; CHECK-NEXT:    add x10, sp, #112
+; CHECK-NEXT:    add x8, sp, #504
+; CHECK-NEXT:    ld1 { v2.b }[4], [x13]
+; CHECK-NEXT:    add x13, sp, #376
+; CHECK-NEXT:    ld1 { v1.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v4.b }[3], [x13]
+; CHECK-NEXT:    add x13, sp, #440
+; CHECK-NEXT:    ld1 { v3.b }[3], [x8]
+; CHECK-NEXT:    ld1 { v5.b }[3], [x13]
+; CHECK-NEXT:    add x11, sp, #120
+; CHECK-NEXT:    add x8, sp, #56
+; CHECK-NEXT:    mov v0.b[4], w4
+; CHECK-NEXT:    add x13, sp, #512
+; CHECK-NEXT:    ld1 { v1.b }[5], [x11]
+; CHECK-NEXT:    ld1 { v2.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #384
+; CHECK-NEXT:    add x11, sp, #448
+; CHECK-NEXT:    ld1 { v3.b }[4], [x13]
+; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
+; CHECK-NEXT:    ld1 { v5.b }[4], [x11]
+; CHECK-NEXT:    add x12, sp, #128
 ; CHECK-NEXT:    add x10, sp, #64
-; CHECK-NEXT:    ldr b5, [sp, #144]
-; CHECK-NEXT:    mov v1.b[4], w4
-; CHECK-NEXT:    ld1 { v3.b }[3], [x12]
-; CHECK-NEXT:    ld1 { v0.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #136
-; CHECK-NEXT:    ld1 { v2.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #360
-; CHECK-NEXT:    add x12, sp, #72
-; CHECK-NEXT:    mov v1.b[5], w5
-; CHECK-NEXT:    ld1 { v0.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #424
-; CHECK-NEXT:    ld1 { v4.b }[1], [x11]
-; CHECK-NEXT:    add x11, sp, #512
+; CHECK-NEXT:    add x8, sp, #520
+; CHECK-NEXT:    mov v0.b[5], w5
+; CHECK-NEXT:    ld1 { v1.b }[6], [x12]
 ; CHECK-NEXT:    ld1 { v2.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #368
-; CHECK-NEXT:    ld1 { v6.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #376
-; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #432
-; CHECK-NEXT:    ld1 { v4.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #520
-; CHECK-NEXT:    mov v1.b[6], w6
-; CHECK-NEXT:    ld1 { v2.b }[7], [x12]
-; CHECK-NEXT:    ld1 { v6.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #440
-; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-NEXT:    ld1 { v3.b }[4], [x11]
-; CHECK-NEXT:    ld1 { v4.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #384
-; CHECK-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v6.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #448
-; CHECK-NEXT:    mov v1.b[7], w7
-; CHECK-NEXT:    ld1 { v3.b }[5], [x10]
-; CHECK-NEXT:    sshll v5.4s, v5.4h, #0
-; CHECK-NEXT:    ld1 { v4.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #392
-; CHECK-NEXT:    add x10, sp, #528
-; CHECK-NEXT:    ld1 { v6.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #456
-; CHECK-NEXT:    mov v7.s[0], v5.s[0]
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ld1 { v4.b }[5], [x9]
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NEXT:    add x10, sp, #392
+; CHECK-NEXT:    add x11, sp, #456
+; CHECK-NEXT:    ldr b6, [sp, #144]
+; CHECK-NEXT:    ldr b7, [sp, #544]
+; CHECK-NEXT:    ld1 { v3.b }[5], [x8]
+; CHECK-NEXT:    ld1 { v4.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v5.b }[5], [x11]
+; CHECK-NEXT:    add x9, sp, #136
+; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
+; CHECK-NEXT:    mov v0.b[6], w6
+; CHECK-NEXT:    ld1 { v1.b }[7], [x9]
+; CHECK-NEXT:    add x8, sp, #528
 ; CHECK-NEXT:    add x9, sp, #400
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    ld1 { v6.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #464
-; CHECK-NEXT:    ld1 { v3.b }[6], [x10]
-; CHECK-NEXT:    saddw v5.4s, v7.4s, v2.4h
+; CHECK-NEXT:    add x10, sp, #464
+; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
+; CHECK-NEXT:    ld1 { v3.b }[6], [x8]
 ; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
-; CHECK-NEXT:    saddl v7.4s, v1.4h, v0.4h
-; CHECK-NEXT:    add x10, sp, #536
+; CHECK-NEXT:    ld1 { v5.b }[6], [x10]
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    add x14, sp, #72
+; CHECK-NEXT:    mov v0.b[7], w7
+; CHECK-NEXT:    sshll v6.4s, v6.4h, #0
+; CHECK-NEXT:    add x8, sp, #536
 ; CHECK-NEXT:    add x9, sp, #408
-; CHECK-NEXT:    ld1 { v6.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #472
-; CHECK-NEXT:    add v5.4s, v7.4s, v5.4s
-; CHECK-NEXT:    ldr b7, [sp, #544]
-; CHECK-NEXT:    saddl2 v0.4s, v1.8h, v0.8h
-; CHECK-NEXT:    ld1 { v3.b }[7], [x10]
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v4.b }[7], [x9]
-; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
-; CHECK-NEXT:    ld1 { v6.b }[7], [x8]
-; CHECK-NEXT:    saddw2 v0.4s, v0.4s, v2.8h
+; CHECK-NEXT:    add x10, sp, #472
 ; CHECK-NEXT:    sshll v7.4s, v7.4h, #0
+; CHECK-NEXT:    ld1 { v2.b }[7], [x14]
+; CHECK-NEXT:    ld1 { v3.b }[7], [x8]
+; CHECK-NEXT:    ld1 { v4.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v5.b }[7], [x10]
+; CHECK-NEXT:    mov v16.s[0], v6.s[0]
+; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NEXT:    mov v17.s[0], v7.s[0]
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-NEXT:    mov v1.s[0], v7.s[0]
 ; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
-; CHECK-NEXT:    saddl v7.4s, v4.4h, v3.4h
+; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
+; CHECK-NEXT:    saddl v7.4s, v0.4h, v1.4h
+; CHECK-NEXT:    saddl2 v0.4s, v0.8h, v1.8h
+; CHECK-NEXT:    saddw v6.4s, v16.4s, v2.4h
+; CHECK-NEXT:    saddl v1.4s, v4.4h, v3.4h
 ; CHECK-NEXT:    saddl2 v3.4s, v4.8h, v3.8h
-; CHECK-NEXT:    saddw v1.4s, v1.4s, v6.4h
-; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT:    saddw2 v2.4s, v3.4s, v6.8h
-; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
+; CHECK-NEXT:    saddw v4.4s, v17.4s, v5.4h
+; CHECK-NEXT:    saddw2 v0.4s, v0.4s, v2.8h
+; CHECK-NEXT:    add v6.4s, v7.4s, v6.4s
+; CHECK-NEXT:    saddw2 v2.4s, v3.4s, v5.8h
+; CHECK-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
 ; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
@@ -1472,12 +1472,13 @@ entry:
 define i32 @test_udot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v32i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x1, #16]
-; CHECK-NEXT:    udot v0.4s, v1.16b, v2.16b
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    udot v0.4s, v1.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x0, #16]
+; CHECK-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    udot v0.4s, v2.16b, v1.16b
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -1497,8 +1498,8 @@ define i32 @test_udot_v32i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_udot_v32i8_nomla:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.16b, #1
-; CHECK-NEXT:    ldr q2, [x0, #16]
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    ldr q2, [x0, #16]
 ; CHECK-NEXT:    udot v1.4s, v2.16b, v0.16b
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    udot v1.4s, v2.16b, v0.16b
@@ -1514,12 +1515,13 @@ entry:
 define i32 @test_sdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_sdot_v32i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x1, #16]
-; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.16b
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    sdot v0.4s, v1.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x0, #16]
+; CHECK-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    sdot v0.4s, v2.16b, v1.16b
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -1540,11 +1542,11 @@ define i32 @test_sdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v16.2d, #0000000000000000
 ; CHECK-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-NEXT:    sdot v16.4s, v1.16b, v3.16b
-; CHECK-NEXT:    sdot v17.4s, v5.16b, v7.16b
-; CHECK-NEXT:    sdot v16.4s, v0.16b, v2.16b
-; CHECK-NEXT:    sdot v17.4s, v4.16b, v6.16b
-; CHECK-NEXT:    add v0.4s, v16.4s, v17.4s
+; CHECK-NEXT:    sdot v17.4s, v1.16b, v3.16b
+; CHECK-NEXT:    sdot v16.4s, v5.16b, v7.16b
+; CHECK-NEXT:    sdot v17.4s, v0.16b, v2.16b
+; CHECK-NEXT:    sdot v16.4s, v4.16b, v6.16b
+; CHECK-NEXT:    add v0.4s, v17.4s, v16.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -1567,11 +1569,11 @@ define i32 @test_sdot_v32i8_double_nomla(<32 x i8> %a, <32 x i8> %b, <32 x i8> %
 ; CHECK-NEXT:    movi v2.16b, #1
 ; CHECK-NEXT:    movi v3.2d, #0000000000000000
 ; CHECK-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-NEXT:    sdot v3.4s, v1.16b, v2.16b
-; CHECK-NEXT:    sdot v6.4s, v5.16b, v2.16b
-; CHECK-NEXT:    sdot v3.4s, v0.16b, v2.16b
-; CHECK-NEXT:    sdot v6.4s, v4.16b, v2.16b
-; CHECK-NEXT:    add v0.4s, v3.4s, v6.4s
+; CHECK-NEXT:    sdot v6.4s, v1.16b, v2.16b
+; CHECK-NEXT:    sdot v3.4s, v5.16b, v2.16b
+; CHECK-NEXT:    sdot v6.4s, v0.16b, v2.16b
+; CHECK-NEXT:    sdot v3.4s, v4.16b, v2.16b
+; CHECK-NEXT:    add v0.4s, v6.4s, v3.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -1587,34 +1589,34 @@ entry:
 define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v33i8:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    ldp q4, q5, [x1]
+; CHECK-NEXT:    ldr b0, [x0, #32]
 ; CHECK-NEXT:    ldr b1, [x1, #32]
-; CHECK-NEXT:    ldr b2, [x0, #32]
-; CHECK-NEXT:    ldp q3, q4, [x0]
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushll v6.8h, v2.8b, #0
+; CHECK-NEXT:    ushll2 v2.8h, v2.16b, #0
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    umull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT:    ushll v16.8h, v3.8b, #0
-; CHECK-NEXT:    ldp q5, q6, [x1]
-; CHECK-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-NEXT:    mov v0.s[0], v1.s[0]
-; CHECK-NEXT:    ushll2 v7.8h, v4.16b, #0
-; CHECK-NEXT:    ushll2 v2.8h, v5.16b, #0
+; CHECK-NEXT:    ushll v7.8h, v4.8b, #0
+; CHECK-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-NEXT:    ushll2 v16.8h, v3.16b, #0
+; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-NEXT:    ushll2 v19.8h, v5.16b, #0
 ; CHECK-NEXT:    ushll v5.8h, v5.8b, #0
-; CHECK-NEXT:    umull2 v18.4s, v2.8h, v3.8h
-; CHECK-NEXT:    umull2 v1.4s, v5.8h, v16.8h
-; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-NEXT:    ushll2 v17.8h, v6.16b, #0
-; CHECK-NEXT:    ushll v6.8h, v6.8b, #0
-; CHECK-NEXT:    umull v2.4s, v2.4h, v3.4h
-; CHECK-NEXT:    umlal2 v18.4s, v17.8h, v7.8h
-; CHECK-NEXT:    umlal2 v1.4s, v6.8h, v4.8h
-; CHECK-NEXT:    umlal v0.4s, v5.4h, v16.4h
-; CHECK-NEXT:    umlal v2.4s, v17.4h, v7.4h
-; CHECK-NEXT:    add v1.4s, v1.4s, v18.4s
-; CHECK-NEXT:    umlal v0.4s, v6.4h, v4.4h
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    umull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT:    umull2 v1.4s, v7.8h, v6.8h
+; CHECK-NEXT:    umull2 v17.4s, v4.8h, v2.8h
+; CHECK-NEXT:    umull v2.4s, v4.4h, v2.4h
+; CHECK-NEXT:    umlal2 v17.4s, v19.8h, v16.8h
+; CHECK-NEXT:    umlal2 v1.4s, v5.8h, v3.8h
+; CHECK-NEXT:    mov v18.s[0], v0.s[0]
+; CHECK-NEXT:    umlal v2.4s, v19.4h, v16.4h
+; CHECK-NEXT:    add v0.4s, v1.4s, v17.4s
+; CHECK-NEXT:    umlal v18.4s, v7.4h, v6.4h
+; CHECK-NEXT:    umlal v18.4s, v5.4h, v3.4h
+; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    add v0.4s, v18.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -1637,19 +1639,19 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v5.8h, v3.16b, #0
-; CHECK-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NEXT:    ushll v5.8h, v3.8b, #0
 ; CHECK-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NEXT:    ushll v1.8h, v3.8b, #0
-; CHECK-NEXT:    uaddl2 v3.4s, v5.8h, v2.8h
-; CHECK-NEXT:    uaddl2 v6.4s, v1.8h, v4.8h
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT:    uaddl v1.4s, v5.4h, v2.4h
-; CHECK-NEXT:    add v2.4s, v6.4s, v3.4s
+; CHECK-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    uaddl2 v6.4s, v3.8h, v2.8h
+; CHECK-NEXT:    uaddl v2.4s, v3.4h, v2.4h
+; CHECK-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-NEXT:    uaddl2 v1.4s, v5.8h, v4.8h
+; CHECK-NEXT:    add v1.4s, v1.4s, v6.4s
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v5.4h
 ; CHECK-NEXT:    uaddw v0.4s, v0.4s, v4.4h
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -1663,34 +1665,34 @@ entry:
 define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_sdot_v33i8:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    ldp q4, q5, [x1]
+; CHECK-NEXT:    ldr b0, [x0, #32]
 ; CHECK-NEXT:    ldr b1, [x1, #32]
-; CHECK-NEXT:    ldr b2, [x0, #32]
-; CHECK-NEXT:    ldp q3, q4, [x0]
+; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    sshll v6.8h, v2.8b, #0
+; CHECK-NEXT:    sshll2 v2.8h, v2.16b, #0
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    smull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT:    sshll v16.8h, v3.8b, #0
-; CHECK-NEXT:    ldp q5, q6, [x1]
-; CHECK-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NEXT:    mov v0.s[0], v1.s[0]
-; CHECK-NEXT:    sshll2 v7.8h, v4.16b, #0
-; CHECK-NEXT:    sshll2 v2.8h, v5.16b, #0
+; CHECK-NEXT:    sshll v7.8h, v4.8b, #0
+; CHECK-NEXT:    sshll2 v4.8h, v4.16b, #0
+; CHECK-NEXT:    sshll2 v16.8h, v3.16b, #0
+; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-NEXT:    sshll2 v19.8h, v5.16b, #0
 ; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-NEXT:    smull2 v18.4s, v2.8h, v3.8h
-; CHECK-NEXT:    smull2 v1.4s, v5.8h, v16.8h
-; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
-; CHECK-NEXT:    sshll2 v17.8h, v6.16b, #0
-; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
-; CHECK-NEXT:    smull v2.4s, v2.4h, v3.4h
-; CHECK-NEXT:    smlal2 v18.4s, v17.8h, v7.8h
-; CHECK-NEXT:    smlal2 v1.4s, v6.8h, v4.8h
-; CHECK-NEXT:    smlal v0.4s, v5.4h, v16.4h
-; CHECK-NEXT:    smlal v2.4s, v17.4h, v7.4h
-; CHECK-NEXT:    add v1.4s, v1.4s, v18.4s
-; CHECK-NEXT:    smlal v0.4s, v6.4h, v4.4h
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-NEXT:    smull2 v1.4s, v7.8h, v6.8h
+; CHECK-NEXT:    smull2 v17.4s, v4.8h, v2.8h
+; CHECK-NEXT:    smull v2.4s, v4.4h, v2.4h
+; CHECK-NEXT:    smlal2 v17.4s, v19.8h, v16.8h
+; CHECK-NEXT:    smlal2 v1.4s, v5.8h, v3.8h
+; CHECK-NEXT:    mov v18.s[0], v0.s[0]
+; CHECK-NEXT:    smlal v2.4s, v19.4h, v16.4h
+; CHECK-NEXT:    add v0.4s, v1.4s, v17.4s
+; CHECK-NEXT:    smlal v18.4s, v7.4h, v6.4h
+; CHECK-NEXT:    smlal v18.4s, v5.4h, v3.4h
+; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    add v0.4s, v18.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -1714,287 +1716,287 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ldr b0, [sp, #80]
 ; CHECK-NEXT:    add x8, sp, #88
-; CHECK-NEXT:    ldr b1, [sp, #144]
-; CHECK-NEXT:    add x9, sp, #96
+; CHECK-NEXT:    ldr b2, [sp, #144]
+; CHECK-NEXT:    fmov s4, w0
+; CHECK-NEXT:    add x10, sp, #152
 ; CHECK-NEXT:    ldr b3, [sp, #16]
-; CHECK-NEXT:    add x10, sp, #104
 ; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #152
-; CHECK-NEXT:    ldr b4, [sp, #344]
-; CHECK-NEXT:    fmov s2, w0
-; CHECK-NEXT:    ldr b6, [sp, #216]
-; CHECK-NEXT:    add x11, sp, #136
-; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #160
+; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #24
+; CHECK-NEXT:    ldr b1, [sp, #344]
+; CHECK-NEXT:    add x9, sp, #96
+; CHECK-NEXT:    ld1 { v3.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #352
+; CHECK-NEXT:    mov v4.b[1], w1
+; CHECK-NEXT:    add x8, sp, #104
 ; CHECK-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #24
-; CHECK-NEXT:    mov v2.b[1], w1
-; CHECK-NEXT:    ldr b17, [sp, #280]
-; CHECK-NEXT:    ldr b7, [sp, #408]
-; CHECK-NEXT:    ld1 { v1.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #168
-; CHECK-NEXT:    ld1 { v3.b }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #160
+; CHECK-NEXT:    ld1 { v1.b }[1], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
 ; CHECK-NEXT:    add x9, sp, #32
-; CHECK-NEXT:    ld1 { v0.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #112
-; CHECK-NEXT:    mov v2.b[2], w2
-; CHECK-NEXT:    ldr b5, [sp, #208]
-; CHECK-NEXT:    ld1 { v1.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #176
+; CHECK-NEXT:    add x12, sp, #360
 ; CHECK-NEXT:    ld1 { v3.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #40
-; CHECK-NEXT:    ld1 { v0.b }[4], [x10]
+; CHECK-NEXT:    add x11, sp, #112
 ; CHECK-NEXT:    add x10, sp, #120
-; CHECK-NEXT:    mov v2.b[3], w3
-; CHECK-NEXT:    ld1 { v1.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #184
-; CHECK-NEXT:    ld1 { v3.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[2], [x12]
+; CHECK-NEXT:    add x12, sp, #168
+; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
+; CHECK-NEXT:    mov v4.b[2], w2
+; CHECK-NEXT:    ld1 { v2.b }[3], [x12]
+; CHECK-NEXT:    add x12, sp, #40
+; CHECK-NEXT:    ld1 { v3.b }[3], [x12]
+; CHECK-NEXT:    add x13, sp, #176
+; CHECK-NEXT:    ldr b16, [sp, #216]
+; CHECK-NEXT:    ld1 { v0.b }[4], [x11]
+; CHECK-NEXT:    add x11, sp, #48
+; CHECK-NEXT:    add x12, sp, #368
+; CHECK-NEXT:    ld1 { v2.b }[4], [x13]
+; CHECK-NEXT:    add x13, sp, #224
 ; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    mov v4.b[3], w3
+; CHECK-NEXT:    ld1 { v3.b }[4], [x11]
+; CHECK-NEXT:    ld1 { v16.b }[1], [x13]
 ; CHECK-NEXT:    ld1 { v0.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #48
-; CHECK-NEXT:    mov v2.b[4], w4
-; CHECK-NEXT:    ld1 { v1.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #192
-; CHECK-NEXT:    ld1 { v3.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #360
+; CHECK-NEXT:    add x10, sp, #56
+; CHECK-NEXT:    ld1 { v1.b }[3], [x12]
+; CHECK-NEXT:    add x12, sp, #184
+; CHECK-NEXT:    ldr b5, [sp, #280]
+; CHECK-NEXT:    add x11, sp, #376
+; CHECK-NEXT:    ld1 { v3.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[5], [x12]
+; CHECK-NEXT:    add x10, sp, #232
+; CHECK-NEXT:    mov v4.b[4], w4
 ; CHECK-NEXT:    ld1 { v0.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #56
-; CHECK-NEXT:    mov v2.b[5], w5
-; CHECK-NEXT:    ld1 { v1.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #200
-; CHECK-NEXT:    ld1 { v3.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #64
-; CHECK-NEXT:    ld1 { v0.b }[7], [x11]
-; CHECK-NEXT:    add x11, sp, #232
-; CHECK-NEXT:    mov v2.b[6], w6
-; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ld1 { v3.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #72
-; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
-; CHECK-NEXT:    ld1 { v4.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #224
-; CHECK-NEXT:    mov v2.b[7], w7
-; CHECK-NEXT:    ld1 { v3.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #416
-; CHECK-NEXT:    ld1 { v6.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #288
-; CHECK-NEXT:    ld1 { v4.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #368
-; CHECK-NEXT:    ld1 { v7.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #424
-; CHECK-NEXT:    ld1 { v17.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #296
-; CHECK-NEXT:    ld1 { v6.b }[2], [x11]
-; CHECK-NEXT:    add x11, sp, #240
-; CHECK-NEXT:    ld1 { v4.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #376
-; CHECK-NEXT:    ld1 { v7.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #432
+; CHECK-NEXT:    add x9, sp, #288
+; CHECK-NEXT:    add x15, sp, #64
+; CHECK-NEXT:    ld1 { v16.b }[2], [x10]
+; CHECK-NEXT:    ldr b17, [sp, #408]
+; CHECK-NEXT:    ld1 { v5.b }[1], [x9]
+; CHECK-NEXT:    add x14, sp, #192
+; CHECK-NEXT:    ld1 { v1.b }[4], [x11]
+; CHECK-NEXT:    ld1 { v3.b }[6], [x15]
+; CHECK-NEXT:    add x15, sp, #416
+; CHECK-NEXT:    ld1 { v2.b }[6], [x14]
+; CHECK-NEXT:    add x14, sp, #240
+; CHECK-NEXT:    ld1 { v17.b }[1], [x15]
+; CHECK-NEXT:    add x9, sp, #296
+; CHECK-NEXT:    add x8, sp, #136
+; CHECK-NEXT:    mov v4.b[5], w5
+; CHECK-NEXT:    add x13, sp, #384
+; CHECK-NEXT:    ld1 { v16.b }[3], [x14]
+; CHECK-NEXT:    ld1 { v5.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[5], [x13]
+; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #424
+; CHECK-NEXT:    add x9, sp, #248
 ; CHECK-NEXT:    ld1 { v17.b }[2], [x8]
 ; CHECK-NEXT:    add x8, sp, #304
-; CHECK-NEXT:    ld1 { v6.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #248
-; CHECK-NEXT:    ld1 { v4.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #384
-; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #440
-; CHECK-NEXT:    ld1 { v17.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #312
-; CHECK-NEXT:    ld1 { v6.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #256
-; CHECK-NEXT:    ld1 { v4.b }[5], [x10]
 ; CHECK-NEXT:    add x10, sp, #392
-; CHECK-NEXT:    ld1 { v7.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #448
+; CHECK-NEXT:    ld1 { v16.b }[4], [x9]
+; CHECK-NEXT:    ld1 { v5.b }[3], [x8]
+; CHECK-NEXT:    mov v4.b[6], w6
+; CHECK-NEXT:    ld1 { v1.b }[6], [x10]
+; CHECK-NEXT:    add x10, sp, #432
+; CHECK-NEXT:    add x9, sp, #256
+; CHECK-NEXT:    ld1 { v17.b }[3], [x10]
+; CHECK-NEXT:    add x10, sp, #312
+; CHECK-NEXT:    ldr b22, [sp, #608]
+; CHECK-NEXT:    add x8, sp, #400
+; CHECK-NEXT:    ld1 { v16.b }[5], [x9]
+; CHECK-NEXT:    ld1 { v5.b }[4], [x10]
+; CHECK-NEXT:    add x9, sp, #616
+; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #440
+; CHECK-NEXT:    ld1 { v22.b }[1], [x9]
+; CHECK-NEXT:    mov v4.b[7], w7
 ; CHECK-NEXT:    ld1 { v17.b }[4], [x8]
 ; CHECK-NEXT:    add x8, sp, #320
-; CHECK-NEXT:    ld1 { v6.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #264
-; CHECK-NEXT:    sshll v19.8h, v2.8b, #0
-; CHECK-NEXT:    ld1 { v4.b }[6], [x10]
-; CHECK-NEXT:    ld1 { v7.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #456
-; CHECK-NEXT:    ld1 { v17.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #328
-; CHECK-NEXT:    ld1 { v6.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #272
-; CHECK-NEXT:    sshll v2.8h, v1.8b, #0
-; CHECK-NEXT:    ldr b1, [sp, #608]
-; CHECK-NEXT:    ld1 { v7.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #464
+; CHECK-NEXT:    add x10, sp, #448
+; CHECK-NEXT:    ldr b6, [sp, #208]
+; CHECK-NEXT:    ld1 { v5.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #624
+; CHECK-NEXT:    ldr b7, [sp, #472]
+; CHECK-NEXT:    ld1 { v22.b }[2], [x8]
+; CHECK-NEXT:    ld1 { v17.b }[5], [x10]
+; CHECK-NEXT:    add x10, sp, #328
+; CHECK-NEXT:    sshll v20.8h, v4.8b, #0
+; CHECK-NEXT:    ldr b4, [sp, #480]
+; CHECK-NEXT:    add x8, sp, #456
+; CHECK-NEXT:    ld1 { v5.b }[6], [x10]
+; CHECK-NEXT:    add x10, sp, #632
+; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
+; CHECK-NEXT:    ld1 { v22.b }[3], [x10]
+; CHECK-NEXT:    add x10, sp, #488
 ; CHECK-NEXT:    ld1 { v17.b }[6], [x8]
 ; CHECK-NEXT:    add x8, sp, #336
-; CHECK-NEXT:    ld1 { v6.b }[7], [x11]
-; CHECK-NEXT:    add x10, sp, #400
-; CHECK-NEXT:    sshll v16.8h, v3.8b, #0
-; CHECK-NEXT:    add x11, sp, #648
-; CHECK-NEXT:    ld1 { v7.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #624
-; CHECK-NEXT:    ld1 { v17.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #616
-; CHECK-NEXT:    sshll v21.8h, v6.8b, #0
-; CHECK-NEXT:    ldr b6, [sp, #472]
-; CHECK-NEXT:    ld1 { v4.b }[7], [x10]
-; CHECK-NEXT:    add x10, sp, #552
-; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #488
-; CHECK-NEXT:    sshll v18.8h, v17.8b, #0
-; CHECK-NEXT:    ldr b17, [sp, #480]
-; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
-; CHECK-NEXT:    sshll v3.8h, v4.8b, #0
-; CHECK-NEXT:    ld1 { v17.b }[1], [x8]
+; CHECK-NEXT:    ld1 { v4.b }[1], [x10]
+; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
+; CHECK-NEXT:    ld1 { v5.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #640
+; CHECK-NEXT:    add x9, sp, #264
+; CHECK-NEXT:    ld1 { v22.b }[4], [x8]
 ; CHECK-NEXT:    add x8, sp, #496
-; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #632
-; CHECK-NEXT:    sshll v4.8h, v7.8b, #0
-; CHECK-NEXT:    smull v20.4s, v5.4h, v6.4h
-; CHECK-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v17.b }[2], [x8]
-; CHECK-NEXT:    smull v5.4s, v16.4h, v18.4h
-; CHECK-NEXT:    ld1 { v1.b }[3], [x9]
-; CHECK-NEXT:    smull2 v16.4s, v16.8h, v18.8h
-; CHECK-NEXT:    ldr b18, [sp, #544]
+; CHECK-NEXT:    ld1 { v16.b }[6], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #648
+; CHECK-NEXT:    smull v18.4s, v6.4h, v7.4h
+; CHECK-NEXT:    ldr b7, [sp, #544]
+; CHECK-NEXT:    add x9, sp, #272
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v22.b }[5], [x8]
 ; CHECK-NEXT:    add x8, sp, #504
-; CHECK-NEXT:    add x9, sp, #640
-; CHECK-NEXT:    mov v7.s[0], v20.s[0]
-; CHECK-NEXT:    ldr b20, [sp, #672]
-; CHECK-NEXT:    ld1 { v18.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #680
-; CHECK-NEXT:    ld1 { v17.b }[3], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[3], [x8]
+; CHECK-NEXT:    add x8, sp, #552
+; CHECK-NEXT:    add x9, sp, #656
+; CHECK-NEXT:    ld1 { v7.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #512
+; CHECK-NEXT:    ldr b21, [sp, #672]
+; CHECK-NEXT:    ld1 { v22.b }[6], [x9]
+; CHECK-NEXT:    mov v6.s[0], v18.s[0]
+; CHECK-NEXT:    add x9, sp, #664
+; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
 ; CHECK-NEXT:    add x8, sp, #560
-; CHECK-NEXT:    ld1 { v1.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #512
-; CHECK-NEXT:    ld1 { v20.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #520
-; CHECK-NEXT:    ld1 { v18.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #688
-; CHECK-NEXT:    ld1 { v17.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #568
-; CHECK-NEXT:    smull2 v6.4s, v19.8h, v21.8h
-; CHECK-NEXT:    ld1 { v1.b }[5], [x11]
-; CHECK-NEXT:    ld1 { v20.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #696
-; CHECK-NEXT:    ld1 { v18.b }[3], [x9]
+; CHECK-NEXT:    sshll v23.8h, v16.8b, #0
+; CHECK-NEXT:    ld1 { v7.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #520
+; CHECK-NEXT:    ldr b24, [sp, #872]
+; CHECK-NEXT:    ld1 { v22.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #528
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v4.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #568
+; CHECK-NEXT:    smull2 v18.4s, v20.8h, v23.8h
+; CHECK-NEXT:    ld1 { v7.b }[3], [x8]
+; CHECK-NEXT:    add x8, sp, #680
+; CHECK-NEXT:    smlal v6.4s, v20.4h, v23.4h
+; CHECK-NEXT:    ld1 { v21.b }[1], [x8]
+; CHECK-NEXT:    sshll v20.8h, v22.8b, #0
+; CHECK-NEXT:    ldr b22, [sp, #736]
+; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
 ; CHECK-NEXT:    add x9, sp, #576
-; CHECK-NEXT:    ld1 { v17.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #528
-; CHECK-NEXT:    smlal v7.4s, v19.4h, v21.4h
-; CHECK-NEXT:    ldr b19, [sp, #872]
-; CHECK-NEXT:    ld1 { v20.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #704
-; CHECK-NEXT:    ld1 { v18.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #584
-; CHECK-NEXT:    ld1 { v17.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #536
-; CHECK-NEXT:    ldr b21, [sp, #936]
-; CHECK-NEXT:    add x11, sp, #656
-; CHECK-NEXT:    ld1 { v20.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #712
-; CHECK-NEXT:    ld1 { v18.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #592
-; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
-; CHECK-NEXT:    add x10, sp, #880
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ld1 { v1.b }[6], [x11]
-; CHECK-NEXT:    ld1 { v20.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #600
-; CHECK-NEXT:    ld1 { v18.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #720
-; CHECK-NEXT:    ld1 { v19.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #944
-; CHECK-NEXT:    smlal2 v6.4s, v0.8h, v3.8h
-; CHECK-NEXT:    add x11, sp, #664
-; CHECK-NEXT:    ld1 { v20.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #888
-; CHECK-NEXT:    ld1 { v18.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #728
-; CHECK-NEXT:    ld1 { v21.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #752
-; CHECK-NEXT:    ld1 { v19.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #952
-; CHECK-NEXT:    ld1 { v20.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #896
-; CHECK-NEXT:    smlal v7.4s, v0.4h, v3.4h
-; CHECK-NEXT:    ldr b0, [sp, #744]
+; CHECK-NEXT:    ldr b23, [sp, #1000]
+; CHECK-NEXT:    ld1 { v7.b }[4], [x9]
+; CHECK-NEXT:    add x9, sp, #688
+; CHECK-NEXT:    sshll v22.8h, v22.8b, #0
 ; CHECK-NEXT:    ld1 { v21.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #904
-; CHECK-NEXT:    ld1 { v19.b }[3], [x8]
+; CHECK-NEXT:    add x9, sp, #696
+; CHECK-NEXT:    sshll v23.8h, v23.8b, #0
+; CHECK-NEXT:    add x8, sp, #536
+; CHECK-NEXT:    ldr b25, [sp, #936]
+; CHECK-NEXT:    add x10, sp, #464
+; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #584
+; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v21.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v7.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #880
+; CHECK-NEXT:    add x9, sp, #704
+; CHECK-NEXT:    smull v22.4s, v22.4h, v23.4h
+; CHECK-NEXT:    ldr b23, [sp, #744]
+; CHECK-NEXT:    ld1 { v24.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #944
+; CHECK-NEXT:    add x10, sp, #888
+; CHECK-NEXT:    ld1 { v21.b }[4], [x9]
+; CHECK-NEXT:    add x9, sp, #752
+; CHECK-NEXT:    ld1 { v25.b }[1], [x8]
+; CHECK-NEXT:    ld1 { v23.b }[1], [x9]
+; CHECK-NEXT:    add x8, sp, #712
+; CHECK-NEXT:    add x9, sp, #760
+; CHECK-NEXT:    ld1 { v24.b }[2], [x10]
+; CHECK-NEXT:    add x10, sp, #952
+; CHECK-NEXT:    mov v19.s[0], v22.s[0]
+; CHECK-NEXT:    ldr b22, [sp, #808]
+; CHECK-NEXT:    ld1 { v25.b }[2], [x10]
+; CHECK-NEXT:    ld1 { v21.b }[5], [x8]
+; CHECK-NEXT:    ld1 { v23.b }[2], [x9]
+; CHECK-NEXT:    add x8, sp, #816
+; CHECK-NEXT:    add x9, sp, #896
+; CHECK-NEXT:    ld1 { v22.b }[1], [x8]
 ; CHECK-NEXT:    add x8, sp, #960
-; CHECK-NEXT:    ld1 { v0.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #760
-; CHECK-NEXT:    ld1 { v1.b }[7], [x11]
-; CHECK-NEXT:    add x11, sp, #816
-; CHECK-NEXT:    ld1 { v21.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #968
-; CHECK-NEXT:    ldr b3, [sp, #808]
-; CHECK-NEXT:    ld1 { v19.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #912
-; CHECK-NEXT:    ld1 { v0.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #768
-; CHECK-NEXT:    ld1 { v3.b }[1], [x11]
-; CHECK-NEXT:    add x11, sp, #824
-; CHECK-NEXT:    ld1 { v21.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #976
-; CHECK-NEXT:    ld1 { v19.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #920
-; CHECK-NEXT:    ld1 { v0.b }[3], [x10]
+; CHECK-NEXT:    ld1 { v24.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #768
+; CHECK-NEXT:    ld1 { v25.b }[3], [x8]
+; CHECK-NEXT:    add x10, sp, #904
+; CHECK-NEXT:    ld1 { v23.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #824
+; CHECK-NEXT:    add x8, sp, #720
+; CHECK-NEXT:    ld1 { v22.b }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #968
+; CHECK-NEXT:    ld1 { v24.b }[4], [x10]
 ; CHECK-NEXT:    add x10, sp, #776
-; CHECK-NEXT:    ld1 { v3.b }[2], [x11]
-; CHECK-NEXT:    add x11, sp, #832
-; CHECK-NEXT:    ld1 { v21.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #984
-; CHECK-NEXT:    ld1 { v19.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #928
-; CHECK-NEXT:    ld1 { v0.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #848
-; CHECK-NEXT:    ld1 { v3.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #840
+; CHECK-NEXT:    ld1 { v25.b }[4], [x9]
 ; CHECK-NEXT:    ld1 { v21.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #992
-; CHECK-NEXT:    ld1 { v19.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v23.b }[4], [x10]
+; CHECK-NEXT:    add x8, sp, #832
+; CHECK-NEXT:    add x9, sp, #912
+; CHECK-NEXT:    ld1 { v22.b }[3], [x8]
+; CHECK-NEXT:    add x8, sp, #976
+; CHECK-NEXT:    ld1 { v24.b }[5], [x9]
 ; CHECK-NEXT:    add x9, sp, #784
-; CHECK-NEXT:    smlal2 v16.4s, v2.8h, v4.8h
-; CHECK-NEXT:    ld1 { v3.b }[4], [x11]
+; CHECK-NEXT:    ld1 { v25.b }[5], [x8]
+; CHECK-NEXT:    add x10, sp, #920
+; CHECK-NEXT:    ld1 { v23.b }[5], [x9]
+; CHECK-NEXT:    add x9, sp, #840
+; CHECK-NEXT:    add x8, sp, #728
+; CHECK-NEXT:    ld1 { v22.b }[4], [x9]
+; CHECK-NEXT:    add x9, sp, #984
+; CHECK-NEXT:    ld1 { v24.b }[6], [x10]
+; CHECK-NEXT:    add x10, sp, #792
+; CHECK-NEXT:    ld1 { v25.b }[6], [x9]
 ; CHECK-NEXT:    ld1 { v21.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #792
-; CHECK-NEXT:    ld1 { v0.b }[5], [x9]
+; CHECK-NEXT:    ld1 { v23.b }[6], [x10]
+; CHECK-NEXT:    add x8, sp, #848
+; CHECK-NEXT:    add x9, sp, #928
+; CHECK-NEXT:    ld1 { v22.b }[5], [x8]
+; CHECK-NEXT:    add x12, sp, #72
+; CHECK-NEXT:    add x8, sp, #992
+; CHECK-NEXT:    ld1 { v24.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #800
+; CHECK-NEXT:    ld1 { v3.b }[7], [x12]
+; CHECK-NEXT:    ld1 { v25.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #592
+; CHECK-NEXT:    ld1 { v23.b }[7], [x9]
 ; CHECK-NEXT:    add x9, sp, #856
-; CHECK-NEXT:    smlal v5.4s, v2.4h, v4.4h
-; CHECK-NEXT:    ldr b2, [sp, #736]
-; CHECK-NEXT:    sshll v4.8h, v20.8b, #0
-; CHECK-NEXT:    ldr b20, [sp, #1000]
-; CHECK-NEXT:    ld1 { v3.b }[5], [x10]
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    ld1 { v0.b }[6], [x8]
-; CHECK-NEXT:    sshll v20.8h, v20.8b, #0
-; CHECK-NEXT:    add x8, sp, #800
+; CHECK-NEXT:    ld1 { v7.b }[6], [x8]
+; CHECK-NEXT:    add x11, sp, #200
+; CHECK-NEXT:    ld1 { v22.b }[6], [x9]
+; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
+; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
 ; CHECK-NEXT:    sshll v21.8h, v21.8b, #0
-; CHECK-NEXT:    smull v2.4s, v2.4h, v20.4h
-; CHECK-NEXT:    ld1 { v3.b }[6], [x9]
-; CHECK-NEXT:    smull v20.4s, v4.4h, v21.4h
-; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
-; CHECK-NEXT:    smull2 v4.4s, v4.8h, v21.8h
+; CHECK-NEXT:    sshll v24.8h, v24.8b, #0
+; CHECK-NEXT:    sshll v25.8h, v25.8b, #0
+; CHECK-NEXT:    add x8, sp, #600
+; CHECK-NEXT:    sshll v23.8h, v23.8b, #0
 ; CHECK-NEXT:    add x9, sp, #864
-; CHECK-NEXT:    movi v21.2d, #0000000000000000
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ld1 { v2.b }[7], [x11]
+; CHECK-NEXT:    ld1 { v7.b }[7], [x8]
+; CHECK-NEXT:    ld1 { v22.b }[7], [x9]
+; CHECK-NEXT:    smull v16.4s, v3.4h, v5.4h
+; CHECK-NEXT:    smull2 v3.4s, v3.8h, v5.8h
+; CHECK-NEXT:    smull v5.4s, v21.4h, v25.4h
+; CHECK-NEXT:    smull2 v21.4s, v21.8h, v25.8h
+; CHECK-NEXT:    smull2 v25.4s, v20.8h, v24.8h
+; CHECK-NEXT:    smlal v19.4s, v4.4h, v23.4h
+; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    sshll v17.8h, v17.8b, #0
-; CHECK-NEXT:    ld1 { v3.b }[7], [x9]
-; CHECK-NEXT:    sshll v19.8h, v19.8b, #0
-; CHECK-NEXT:    mov v21.s[0], v2.s[0]
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    smull2 v2.4s, v1.8h, v19.8h
-; CHECK-NEXT:    sshll v18.8h, v18.8b, #0
-; CHECK-NEXT:    smlal v21.4s, v17.4h, v0.4h
-; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-NEXT:    smlal2 v2.4s, v17.8h, v0.8h
-; CHECK-NEXT:    smlal2 v4.4s, v18.8h, v3.8h
-; CHECK-NEXT:    smlal v20.4s, v18.4h, v3.4h
-; CHECK-NEXT:    smlal v21.4s, v1.4h, v19.4h
-; CHECK-NEXT:    add v0.4s, v6.4s, v16.4s
-; CHECK-NEXT:    add v1.4s, v7.4s, v5.4s
-; CHECK-NEXT:    add v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    add v3.4s, v21.4s, v20.4s
+; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
+; CHECK-NEXT:    sshll v22.8h, v22.8b, #0
+; CHECK-NEXT:    smlal2 v3.4s, v2.8h, v17.8h
+; CHECK-NEXT:    smlal v16.4s, v2.4h, v17.4h
+; CHECK-NEXT:    smlal2 v25.4s, v4.8h, v23.8h
+; CHECK-NEXT:    smlal2 v18.4s, v0.8h, v1.8h
+; CHECK-NEXT:    smlal v6.4s, v0.4h, v1.4h
+; CHECK-NEXT:    smlal v19.4s, v20.4h, v24.4h
+; CHECK-NEXT:    smlal2 v21.4s, v7.8h, v22.8h
+; CHECK-NEXT:    smlal v5.4s, v7.4h, v22.4h
+; CHECK-NEXT:    add v0.4s, v18.4s, v3.4s
+; CHECK-NEXT:    add v1.4s, v6.4s, v16.4s
+; CHECK-NEXT:    add v2.4s, v25.4s, v21.4s
+; CHECK-NEXT:    add v3.4s, v19.4s, v5.4s
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    add v1.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
@@ -2021,153 +2023,153 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldr b0, [sp, #80]
+; CHECK-NEXT:    ldr b1, [sp, #80]
 ; CHECK-NEXT:    add x8, sp, #88
 ; CHECK-NEXT:    ldr b2, [sp, #144]
 ; CHECK-NEXT:    add x9, sp, #152
-; CHECK-NEXT:    fmov s3, w0
-; CHECK-NEXT:    ldr b4, [sp, #16]
-; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #96
-; CHECK-NEXT:    add x10, sp, #104
+; CHECK-NEXT:    ldr b3, [sp, #16]
+; CHECK-NEXT:    add x12, sp, #32
+; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
 ; CHECK-NEXT:    ld1 { v2.b }[1], [x9]
-; CHECK-NEXT:    mov v3.b[1], w1
-; CHECK-NEXT:    add x9, sp, #160
-; CHECK-NEXT:    add x11, sp, #128
-; CHECK-NEXT:    ldr b1, [sp, #208]
-; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
+; CHECK-NEXT:    add x9, sp, #96
 ; CHECK-NEXT:    add x8, sp, #24
-; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #168
-; CHECK-NEXT:    mov v3.b[2], w2
-; CHECK-NEXT:    ld1 { v4.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #112
-; CHECK-NEXT:    ld1 { v0.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #32
-; CHECK-NEXT:    ld1 { v2.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #176
-; CHECK-NEXT:    mov v3.b[3], w3
-; CHECK-NEXT:    ld1 { v4.b }[2], [x10]
+; CHECK-NEXT:    add x11, sp, #112
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ld1 { v3.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #160
+; CHECK-NEXT:    ldr b4, [sp, #480]
+; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #104
+; CHECK-NEXT:    ld1 { v2.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #168
 ; CHECK-NEXT:    add x10, sp, #120
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #40
-; CHECK-NEXT:    ld1 { v2.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #184
-; CHECK-NEXT:    mov v3.b[4], w4
-; CHECK-NEXT:    ld1 { v4.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #48
-; CHECK-NEXT:    ld1 { v0.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #136
-; CHECK-NEXT:    ld1 { v2.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #192
-; CHECK-NEXT:    mov v3.b[5], w5
+; CHECK-NEXT:    add x13, sp, #48
+; CHECK-NEXT:    ld1 { v3.b }[2], [x12]
+; CHECK-NEXT:    add x12, sp, #40
+; CHECK-NEXT:    ldr b5, [sp, #608]
+; CHECK-NEXT:    ld1 { v1.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[3], [x8]
+; CHECK-NEXT:    mov v0.b[1], w1
+; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    add x14, sp, #184
+; CHECK-NEXT:    ldr b16, [sp, #544]
+; CHECK-NEXT:    ld1 { v3.b }[3], [x12]
+; CHECK-NEXT:    add x12, sp, #176
+; CHECK-NEXT:    ldr b17, [sp, #672]
+; CHECK-NEXT:    ld1 { v1.b }[4], [x11]
+; CHECK-NEXT:    add x11, sp, #488
+; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
+; CHECK-NEXT:    ld1 { v4.b }[1], [x11]
+; CHECK-NEXT:    mov v0.b[2], w2
+; CHECK-NEXT:    add x11, sp, #192
+; CHECK-NEXT:    ld1 { v3.b }[4], [x13]
+; CHECK-NEXT:    add x13, sp, #616
+; CHECK-NEXT:    add x12, sp, #56
+; CHECK-NEXT:    ld1 { v1.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v5.b }[1], [x13]
+; CHECK-NEXT:    add x13, sp, #496
+; CHECK-NEXT:    ld1 { v4.b }[2], [x13]
+; CHECK-NEXT:    ld1 { v2.b }[5], [x14]
+; CHECK-NEXT:    add x14, sp, #680
+; CHECK-NEXT:    ld1 { v17.b }[1], [x14]
+; CHECK-NEXT:    add x13, sp, #504
+; CHECK-NEXT:    ld1 { v3.b }[5], [x12]
+; CHECK-NEXT:    ld1 { v1.b }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #552
+; CHECK-NEXT:    add x12, sp, #688
+; CHECK-NEXT:    ld1 { v16.b }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #624
+; CHECK-NEXT:    ld1 { v4.b }[3], [x13]
+; CHECK-NEXT:    ld1 { v2.b }[6], [x11]
+; CHECK-NEXT:    add x11, sp, #560
+; CHECK-NEXT:    add x8, sp, #136
+; CHECK-NEXT:    ld1 { v17.b }[2], [x12]
+; CHECK-NEXT:    ld1 { v5.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[2], [x11]
+; CHECK-NEXT:    add x8, sp, #512
+; CHECK-NEXT:    mov v0.b[3], w3
 ; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #56
-; CHECK-NEXT:    ld1 { v0.b }[6], [x11]
+; CHECK-NEXT:    add x8, sp, #568
+; CHECK-NEXT:    add x9, sp, #696
 ; CHECK-NEXT:    add x11, sp, #632
-; CHECK-NEXT:    ld1 { v2.b }[6], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #520
+; CHECK-NEXT:    ld1 { v16.b }[3], [x8]
+; CHECK-NEXT:    ld1 { v5.b }[3], [x11]
+; CHECK-NEXT:    add x8, sp, #640
+; CHECK-NEXT:    ld1 { v4.b }[5], [x9]
+; CHECK-NEXT:    add x9, sp, #576
+; CHECK-NEXT:    add x11, sp, #704
+; CHECK-NEXT:    ldr b18, [sp, #736]
+; CHECK-NEXT:    mov v0.b[4], w4
+; CHECK-NEXT:    ld1 { v17.b }[4], [x11]
+; CHECK-NEXT:    ld1 { v16.b }[4], [x9]
+; CHECK-NEXT:    ld1 { v5.b }[4], [x8]
+; CHECK-NEXT:    add x9, sp, #528
+; CHECK-NEXT:    sshll v18.8h, v18.8b, #0
+; CHECK-NEXT:    add x8, sp, #648
+; CHECK-NEXT:    add x11, sp, #584
+; CHECK-NEXT:    add x12, sp, #712
+; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v16.b }[5], [x11]
+; CHECK-NEXT:    ld1 { v17.b }[5], [x12]
+; CHECK-NEXT:    ld1 { v5.b }[5], [x8]
+; CHECK-NEXT:    mov v0.b[5], w5
+; CHECK-NEXT:    add x9, sp, #536
+; CHECK-NEXT:    sshll v18.4s, v18.4h, #0
+; CHECK-NEXT:    add x8, sp, #656
+; CHECK-NEXT:    add x11, sp, #592
+; CHECK-NEXT:    add x12, sp, #720
+; CHECK-NEXT:    ld1 { v4.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v16.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v17.b }[6], [x12]
+; CHECK-NEXT:    ld1 { v5.b }[6], [x8]
+; CHECK-NEXT:    ldr b6, [sp, #208]
+; CHECK-NEXT:    add x10, sp, #64
+; CHECK-NEXT:    mov v7.s[0], v18.s[0]
+; CHECK-NEXT:    mov v0.b[6], w6
+; CHECK-NEXT:    ld1 { v3.b }[6], [x10]
+; CHECK-NEXT:    add x8, sp, #664
+; CHECK-NEXT:    add x9, sp, #600
+; CHECK-NEXT:    add x10, sp, #728
+; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
+; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
+; CHECK-NEXT:    ld1 { v16.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v5.b }[7], [x8]
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    mov v0.b[7], w7
 ; CHECK-NEXT:    add x9, sp, #200
-; CHECK-NEXT:    mov v3.b[6], w6
-; CHECK-NEXT:    ld1 { v4.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #64
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ld1 { v0.b }[7], [x10]
-; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #552
-; CHECK-NEXT:    mov v3.b[7], w7
-; CHECK-NEXT:    add x10, sp, #680
-; CHECK-NEXT:    ld1 { v4.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #72
-; CHECK-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-NEXT:    sshll v5.4s, v1.4h, #0
-; CHECK-NEXT:    ldr b1, [sp, #608]
+; CHECK-NEXT:    add x10, sp, #72
+; CHECK-NEXT:    saddw v7.4s, v7.4s, v4.4h
+; CHECK-NEXT:    sshll v6.4s, v6.4h, #0
+; CHECK-NEXT:    sshll v16.8h, v16.8b, #0
+; CHECK-NEXT:    sshll v17.8h, v17.8b, #0
+; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
+; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v3.b }[7], [x10]
+; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NEXT:    mov v18.s[0], v6.s[0]
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #616
+; CHECK-NEXT:    saddl2 v6.4s, v17.8h, v16.8h
+; CHECK-NEXT:    saddl2 v4.4s, v5.8h, v4.8h
+; CHECK-NEXT:    saddl v16.4s, v17.4h, v16.4h
+; CHECK-NEXT:    saddw v5.4s, v7.4s, v5.4h
 ; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    sshll v7.8h, v3.8b, #0
-; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #624
-; CHECK-NEXT:    sshll v3.8h, v4.8b, #0
-; CHECK-NEXT:    mov v6.s[0], v5.s[0]
-; CHECK-NEXT:    saddl2 v5.4s, v3.8h, v2.8h
-; CHECK-NEXT:    saddl2 v16.4s, v7.8h, v0.8h
-; CHECK-NEXT:    ld1 { v1.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #488
-; CHECK-NEXT:    saddw v4.4s, v6.4s, v7.4h
-; CHECK-NEXT:    ldr b6, [sp, #480]
-; CHECK-NEXT:    add v5.4s, v16.4s, v5.4s
-; CHECK-NEXT:    ldr b7, [sp, #544]
-; CHECK-NEXT:    ldr b16, [sp, #672]
-; CHECK-NEXT:    ld1 { v6.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #496
-; CHECK-NEXT:    ld1 { v7.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #560
-; CHECK-NEXT:    ld1 { v16.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #688
-; CHECK-NEXT:    ld1 { v1.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #640
-; CHECK-NEXT:    ld1 { v6.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #504
-; CHECK-NEXT:    ld1 { v7.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #568
-; CHECK-NEXT:    ld1 { v16.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #696
-; CHECK-NEXT:    ld1 { v1.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #648
-; CHECK-NEXT:    ld1 { v6.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #512
-; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #576
-; CHECK-NEXT:    ld1 { v16.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #704
-; CHECK-NEXT:    ld1 { v1.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #656
-; CHECK-NEXT:    ld1 { v6.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #520
-; CHECK-NEXT:    ld1 { v7.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #584
-; CHECK-NEXT:    ld1 { v16.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #712
-; CHECK-NEXT:    ld1 { v1.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #664
-; CHECK-NEXT:    ld1 { v6.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #528
-; CHECK-NEXT:    ld1 { v7.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #592
-; CHECK-NEXT:    ld1 { v16.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #720
+; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-NEXT:    saddl2 v17.4s, v0.8h, v1.8h
+; CHECK-NEXT:    saddw v0.4s, v18.4s, v0.4h
+; CHECK-NEXT:    saddl2 v7.4s, v3.8h, v2.8h
+; CHECK-NEXT:    add v4.4s, v4.4s, v6.4s
 ; CHECK-NEXT:    saddl v2.4s, v3.4h, v2.4h
-; CHECK-NEXT:    ldr b3, [sp, #736]
-; CHECK-NEXT:    ld1 { v6.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #600
-; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
-; CHECK-NEXT:    ld1 { v7.b }[6], [x9]
-; CHECK-NEXT:    ld1 { v16.b }[6], [x10]
-; CHECK-NEXT:    add x9, sp, #728
-; CHECK-NEXT:    add x10, sp, #536
-; CHECK-NEXT:    ld1 { v1.b }[7], [x11]
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    add v5.4s, v5.4s, v16.4s
+; CHECK-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT:    add v6.4s, v17.4s, v7.4s
+; CHECK-NEXT:    add v1.4s, v5.4s, v4.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ld1 { v7.b }[7], [x8]
-; CHECK-NEXT:    sshll v2.8h, v3.8b, #0
-; CHECK-NEXT:    ld1 { v16.b }[7], [x9]
-; CHECK-NEXT:    ld1 { v6.b }[7], [x10]
-; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    mov v4.s[0], v2.s[0]
-; CHECK-NEXT:    sshll v3.8h, v7.8b, #0
-; CHECK-NEXT:    sshll v7.8h, v16.8b, #0
-; CHECK-NEXT:    sshll v2.8h, v6.8b, #0
-; CHECK-NEXT:    saddl2 v6.4s, v7.8h, v3.8h
-; CHECK-NEXT:    saddl2 v16.4s, v1.8h, v2.8h
-; CHECK-NEXT:    saddw v2.4s, v4.4s, v2.4h
-; CHECK-NEXT:    saddl v3.4s, v7.4h, v3.4h
-; CHECK-NEXT:    add v4.4s, v16.4s, v6.4s
-; CHECK-NEXT:    saddw v1.4s, v2.4s, v1.4h
-; CHECK-NEXT:    add v2.4s, v3.4s, v4.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -2185,14 +2187,15 @@ define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ; CHECK-LABEL: test_udot_v48i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x1, #32]
-; CHECK-NEXT:    ldr q2, [x0, #32]
-; CHECK-NEXT:    udot v0.4s, v1.16b, v2.16b
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    udot v0.4s, v1.16b, v3.16b
-; CHECK-NEXT:    ldr q1, [x1, #16]
-; CHECK-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ldr q1, [x0, #32]
+; CHECK-NEXT:    ldr q2, [x1, #32]
+; CHECK-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-NEXT:    ldr q1, [x0, #16]
+; CHECK-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEXT:    udot v0.4s, v2.16b, v1.16b
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -2212,8 +2215,8 @@ define i32 @test_udot_v48i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_udot_v48i8_nomla:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.16b, #1
-; CHECK-NEXT:    ldr q2, [x0, #32]
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    ldr q2, [x0, #32]
 ; CHECK-NEXT:    udot v1.4s, v2.16b, v0.16b
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    udot v1.4s, v2.16b, v0.16b
@@ -2232,14 +2235,15 @@ define i32 @test_sdot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ; CHECK-LABEL: test_sdot_v48i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x1, #32]
-; CHECK-NEXT:    ldr q2, [x0, #32]
-; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.16b
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    sdot v0.4s, v1.16b, v3.16b
-; CHECK-NEXT:    ldr q1, [x1, #16]
-; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ldr q1, [x0, #32]
+; CHECK-NEXT:    ldr q2, [x1, #32]
+; CHECK-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-NEXT:    ldr q1, [x0, #16]
+; CHECK-NEXT:    ldr q2, [x1, #16]
+; CHECK-NEXT:    sdot v0.4s, v2.16b, v1.16b
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -2261,380 +2265,380 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldr b2, [sp, #592]
+; CHECK-NEXT:    ldr b3, [sp, #592]
 ; CHECK-NEXT:    add x8, sp, #600
-; CHECK-NEXT:    ldr b3, [sp, #208]
-; CHECK-NEXT:    add x10, sp, #344
+; CHECK-NEXT:    ldr b4, [sp, #208]
 ; CHECK-NEXT:    ldr b0, [sp, #336]
-; CHECK-NEXT:    add x9, sp, #608
-; CHECK-NEXT:    ld1 { v2.b }[1], [x8]
+; CHECK-NEXT:    add x9, sp, #344
+; CHECK-NEXT:    ldr b2, [sp, #464]
+; CHECK-NEXT:    ld1 { v3.b }[1], [x8]
 ; CHECK-NEXT:    add x8, sp, #216
+; CHECK-NEXT:    add x10, sp, #624
+; CHECK-NEXT:    ld1 { v4.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #608
+; CHECK-NEXT:    ld1 { v0.b }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #232
 ; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    add x11, sp, #664
-; CHECK-NEXT:    ld1 { v0.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #352
-; CHECK-NEXT:    ld1 { v3.b }[1], [x8]
+; CHECK-NEXT:    ldr b7, [sp, #1360]
+; CHECK-NEXT:    ld1 { v3.b }[2], [x8]
 ; CHECK-NEXT:    add x8, sp, #224
-; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #616
+; CHECK-NEXT:    add x12, sp, #376
+; CHECK-NEXT:    ld1 { v4.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #616
+; CHECK-NEXT:    add x11, sp, #656
 ; CHECK-NEXT:    mov v1.b[1], w1
-; CHECK-NEXT:    ldr b18, [sp, #1360]
-; CHECK-NEXT:    ld1 { v0.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #240
-; CHECK-NEXT:    ld1 { v3.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #232
-; CHECK-NEXT:    ld1 { v2.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #624
-; CHECK-NEXT:    mov v1.b[2], w2
-; CHECK-NEXT:    ldr b19, [sp, #976]
-; CHECK-NEXT:    ldr b4, [sp, #464]
+; CHECK-NEXT:    ldr b17, [sp, #976]
+; CHECK-NEXT:    add x14, sp, #288
 ; CHECK-NEXT:    ld1 { v3.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #360
-; CHECK-NEXT:    ld1 { v2.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #632
-; CHECK-NEXT:    mov v1.b[3], w3
-; CHECK-NEXT:    ldr b5, [sp, #80]
-; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #368
+; CHECK-NEXT:    add x8, sp, #632
+; CHECK-NEXT:    add x15, sp, #408
+; CHECK-NEXT:    ld1 { v4.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #472
+; CHECK-NEXT:    add x13, sp, #696
+; CHECK-NEXT:    ld1 { v2.b }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #240
+; CHECK-NEXT:    add x16, sp, #448
 ; CHECK-NEXT:    ld1 { v3.b }[4], [x10]
+; CHECK-NEXT:    add x10, sp, #352
+; CHECK-NEXT:    mov v1.b[2], w2
+; CHECK-NEXT:    ld1 { v4.b }[4], [x9]
+; CHECK-NEXT:    ld1 { v0.b }[2], [x10]
+; CHECK-NEXT:    add x10, sp, #1368
+; CHECK-NEXT:    ld1 { v7.b }[1], [x10]
 ; CHECK-NEXT:    add x10, sp, #248
-; CHECK-NEXT:    ld1 { v2.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #376
-; CHECK-NEXT:    mov v1.b[4], w4
-; CHECK-NEXT:    ldr b16, [sp, #1104]
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #640
-; CHECK-NEXT:    ld1 { v3.b }[5], [x10]
+; CHECK-NEXT:    add x9, sp, #640
+; CHECK-NEXT:    ld1 { v3.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #648
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v4.b }[5], [x10]
+; CHECK-NEXT:    add x10, sp, #360
+; CHECK-NEXT:    mov v1.b[3], w3
+; CHECK-NEXT:    ld1 { v0.b }[3], [x10]
 ; CHECK-NEXT:    add x10, sp, #256
-; CHECK-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-NEXT:    ldr b17, [sp, #720]
-; CHECK-NEXT:    ld1 { v2.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #384
-; CHECK-NEXT:    ld1 { v0.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #648
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v3.b }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #368
+; CHECK-NEXT:    ldr b16, [sp, #720]
+; CHECK-NEXT:    ld1 { v4.b }[6], [x10]
+; CHECK-NEXT:    add x10, sp, #984
+; CHECK-NEXT:    ld1 { v0.b }[4], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #664
+; CHECK-NEXT:    ld1 { v3.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #264
+; CHECK-NEXT:    mov v1.b[4], w4
+; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
+; CHECK-NEXT:    add x9, sp, #672
+; CHECK-NEXT:    add x8, sp, #680
+; CHECK-NEXT:    ld1 { v0.b }[5], [x12]
+; CHECK-NEXT:    add x12, sp, #480
+; CHECK-NEXT:    ld1 { v2.b }[2], [x12]
+; CHECK-NEXT:    add x12, sp, #272
+; CHECK-NEXT:    ld1 { v3.b }[8], [x11]
+; CHECK-NEXT:    ld1 { v4.b }[8], [x12]
+; CHECK-NEXT:    add x12, sp, #384
 ; CHECK-NEXT:    mov v1.b[5], w5
-; CHECK-NEXT:    ld1 { v3.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v0.b }[6], [x12]
+; CHECK-NEXT:    add x12, sp, #280
+; CHECK-NEXT:    add x11, sp, #688
+; CHECK-NEXT:    ld1 { v3.b }[9], [x10]
+; CHECK-NEXT:    add x10, sp, #1376
+; CHECK-NEXT:    ld1 { v7.b }[2], [x10]
 ; CHECK-NEXT:    add x10, sp, #392
-; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #656
-; CHECK-NEXT:    ld1 { v0.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #264
-; CHECK-NEXT:    mov v1.b[6], w6
-; CHECK-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v3.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #272
+; CHECK-NEXT:    ld1 { v4.b }[9], [x12]
 ; CHECK-NEXT:    ld1 { v0.b }[7], [x10]
-; CHECK-NEXT:    add x10, sp, #16
-; CHECK-NEXT:    mov v1.b[7], w7
-; CHECK-NEXT:    ld1 { v2.b }[8], [x9]
+; CHECK-NEXT:    mov v1.b[6], w6
+; CHECK-NEXT:    add x12, sp, #704
+; CHECK-NEXT:    ld1 { v3.b }[10], [x9]
 ; CHECK-NEXT:    add x9, sp, #400
-; CHECK-NEXT:    ld1 { v3.b }[8], [x8]
-; CHECK-NEXT:    add x8, sp, #280
-; CHECK-NEXT:    ld1 { v1.b }[8], [x10]
-; CHECK-NEXT:    add x10, sp, #24
+; CHECK-NEXT:    add x10, sp, #712
+; CHECK-NEXT:    ld1 { v4.b }[10], [x14]
+; CHECK-NEXT:    add x14, sp, #992
 ; CHECK-NEXT:    ld1 { v0.b }[8], [x9]
-; CHECK-NEXT:    add x9, sp, #408
-; CHECK-NEXT:    ld1 { v3.b }[9], [x8]
-; CHECK-NEXT:    add x8, sp, #288
-; CHECK-NEXT:    ld1 { v2.b }[9], [x11]
-; CHECK-NEXT:    add x11, sp, #672
-; CHECK-NEXT:    ld1 { v1.b }[9], [x10]
-; CHECK-NEXT:    add x10, sp, #32
-; CHECK-NEXT:    ld1 { v0.b }[9], [x9]
-; CHECK-NEXT:    add x9, sp, #416
-; CHECK-NEXT:    ld1 { v3.b }[10], [x8]
-; CHECK-NEXT:    add x8, sp, #296
-; CHECK-NEXT:    ld1 { v2.b }[10], [x11]
-; CHECK-NEXT:    add x11, sp, #680
-; CHECK-NEXT:    ld1 { v1.b }[10], [x10]
-; CHECK-NEXT:    add x10, sp, #40
-; CHECK-NEXT:    ld1 { v0.b }[10], [x9]
-; CHECK-NEXT:    add x9, sp, #424
+; CHECK-NEXT:    ld1 { v17.b }[2], [x14]
+; CHECK-NEXT:    add x14, sp, #296
 ; CHECK-NEXT:    ld1 { v3.b }[11], [x8]
-; CHECK-NEXT:    add x8, sp, #304
-; CHECK-NEXT:    ld1 { v2.b }[11], [x11]
-; CHECK-NEXT:    add x11, sp, #688
-; CHECK-NEXT:    ld1 { v1.b }[11], [x10]
-; CHECK-NEXT:    add x10, sp, #48
-; CHECK-NEXT:    ld1 { v0.b }[11], [x9]
-; CHECK-NEXT:    add x9, sp, #432
-; CHECK-NEXT:    ld1 { v3.b }[12], [x8]
+; CHECK-NEXT:    add x9, sp, #304
 ; CHECK-NEXT:    add x8, sp, #312
-; CHECK-NEXT:    ld1 { v2.b }[12], [x11]
-; CHECK-NEXT:    add x11, sp, #696
-; CHECK-NEXT:    ld1 { v1.b }[12], [x10]
-; CHECK-NEXT:    add x10, sp, #56
-; CHECK-NEXT:    ld1 { v0.b }[12], [x9]
-; CHECK-NEXT:    add x9, sp, #440
-; CHECK-NEXT:    ld1 { v3.b }[13], [x8]
-; CHECK-NEXT:    add x8, sp, #320
-; CHECK-NEXT:    ld1 { v2.b }[13], [x11]
-; CHECK-NEXT:    add x11, sp, #704
-; CHECK-NEXT:    ld1 { v1.b }[13], [x10]
-; CHECK-NEXT:    add x10, sp, #64
-; CHECK-NEXT:    ld1 { v0.b }[13], [x9]
-; CHECK-NEXT:    add x9, sp, #448
-; CHECK-NEXT:    ld1 { v3.b }[14], [x8]
-; CHECK-NEXT:    add x8, sp, #328
-; CHECK-NEXT:    ld1 { v2.b }[14], [x11]
-; CHECK-NEXT:    add x11, sp, #712
-; CHECK-NEXT:    ld1 { v1.b }[14], [x10]
-; CHECK-NEXT:    add x10, sp, #472
-; CHECK-NEXT:    ld1 { v0.b }[14], [x9]
-; CHECK-NEXT:    add x9, sp, #456
-; CHECK-NEXT:    ld1 { v3.b }[15], [x8]
-; CHECK-NEXT:    add x8, sp, #72
-; CHECK-NEXT:    ld1 { v4.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #88
-; CHECK-NEXT:    ld1 { v2.b }[15], [x11]
-; CHECK-NEXT:    add x11, sp, #480
-; CHECK-NEXT:    ld1 { v1.b }[15], [x8]
-; CHECK-NEXT:    add x8, sp, #1368
-; CHECK-NEXT:    ld1 { v0.b }[15], [x9]
-; CHECK-NEXT:    add x9, sp, #984
-; CHECK-NEXT:    ld1 { v5.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #96
-; CHECK-NEXT:    ld1 { v18.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #1376
-; CHECK-NEXT:    ld1 { v19.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #992
-; CHECK-NEXT:    ld1 { v4.b }[2], [x11]
-; CHECK-NEXT:    add x11, sp, #488
-; CHECK-NEXT:    ld1 { v5.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #104
-; CHECK-NEXT:    ld1 { v18.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #1384
-; CHECK-NEXT:    ld1 { v19.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[11], [x14]
+; CHECK-NEXT:    mov v1.b[7], w7
+; CHECK-NEXT:    add x14, sp, #320
+; CHECK-NEXT:    ld1 { v0.b }[9], [x15]
+; CHECK-NEXT:    add x15, sp, #328
+; CHECK-NEXT:    ld1 { v3.b }[12], [x11]
+; CHECK-NEXT:    add x11, sp, #416
+; CHECK-NEXT:    ld1 { v4.b }[12], [x9]
+; CHECK-NEXT:    add x9, sp, #1384
+; CHECK-NEXT:    ld1 { v0.b }[10], [x11]
+; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #424
+; CHECK-NEXT:    ld1 { v3.b }[13], [x13]
+; CHECK-NEXT:    add x11, sp, #432
+; CHECK-NEXT:    add x13, sp, #440
+; CHECK-NEXT:    ld1 { v4.b }[13], [x8]
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    ld1 { v0.b }[11], [x9]
 ; CHECK-NEXT:    add x9, sp, #1000
-; CHECK-NEXT:    ld1 { v4.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #496
-; CHECK-NEXT:    ld1 { v5.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #112
-; CHECK-NEXT:    ld1 { v18.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #1392
-; CHECK-NEXT:    ld1 { v19.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #1008
-; CHECK-NEXT:    ld1 { v4.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #504
-; CHECK-NEXT:    ld1 { v5.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #120
-; CHECK-NEXT:    ld1 { v18.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #1400
-; CHECK-NEXT:    ld1 { v19.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #1016
-; CHECK-NEXT:    ld1 { v4.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #512
-; CHECK-NEXT:    ld1 { v5.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #128
-; CHECK-NEXT:    ld1 { v18.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #1408
-; CHECK-NEXT:    ld1 { v19.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #1024
-; CHECK-NEXT:    ld1 { v4.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #520
-; CHECK-NEXT:    ld1 { v5.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #136
-; CHECK-NEXT:    ld1 { v18.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #1416
-; CHECK-NEXT:    ld1 { v19.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #1032
-; CHECK-NEXT:    ld1 { v4.b }[7], [x11]
-; CHECK-NEXT:    add x11, sp, #528
-; CHECK-NEXT:    ld1 { v5.b }[7], [x10]
-; CHECK-NEXT:    add x10, sp, #144
-; CHECK-NEXT:    ld1 { v18.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #1424
-; CHECK-NEXT:    ld1 { v19.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #1040
-; CHECK-NEXT:    ld1 { v4.b }[8], [x11]
-; CHECK-NEXT:    add x11, sp, #536
-; CHECK-NEXT:    ld1 { v5.b }[8], [x10]
-; CHECK-NEXT:    add x10, sp, #152
-; CHECK-NEXT:    ld1 { v18.b }[8], [x8]
-; CHECK-NEXT:    add x8, sp, #1432
-; CHECK-NEXT:    ld1 { v19.b }[8], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[8], [x8]
+; CHECK-NEXT:    ld1 { v17.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v3.b }[14], [x12]
+; CHECK-NEXT:    add x12, sp, #488
+; CHECK-NEXT:    ld1 { v4.b }[14], [x14]
+; CHECK-NEXT:    add x14, sp, #1392
+; CHECK-NEXT:    ld1 { v2.b }[3], [x12]
+; CHECK-NEXT:    ld1 { v7.b }[4], [x14]
+; CHECK-NEXT:    add x8, sp, #1008
+; CHECK-NEXT:    ld1 { v0.b }[12], [x11]
+; CHECK-NEXT:    ld1 { v17.b }[4], [x8]
+; CHECK-NEXT:    add x11, sp, #1400
+; CHECK-NEXT:    add x8, sp, #496
+; CHECK-NEXT:    ld1 { v2.b }[4], [x8]
+; CHECK-NEXT:    add x8, sp, #1016
+; CHECK-NEXT:    add x9, sp, #24
+; CHECK-NEXT:    ld1 { v7.b }[5], [x11]
+; CHECK-NEXT:    ld1 { v3.b }[15], [x10]
+; CHECK-NEXT:    ld1 { v0.b }[13], [x13]
+; CHECK-NEXT:    ld1 { v17.b }[5], [x8]
+; CHECK-NEXT:    add x10, sp, #1408
+; CHECK-NEXT:    ld1 { v1.b }[9], [x9]
+; CHECK-NEXT:    add x8, sp, #504
+; CHECK-NEXT:    add x9, sp, #32
+; CHECK-NEXT:    ld1 { v4.b }[15], [x15]
+; CHECK-NEXT:    ld1 { v7.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #1024
+; CHECK-NEXT:    ld1 { v17.b }[6], [x8]
+; CHECK-NEXT:    ld1 { v0.b }[14], [x16]
+; CHECK-NEXT:    ld1 { v1.b }[10], [x9]
+; CHECK-NEXT:    add x9, sp, #1416
+; CHECK-NEXT:    add x10, sp, #512
+; CHECK-NEXT:    add x8, sp, #456
+; CHECK-NEXT:    ld1 { v7.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[6], [x10]
+; CHECK-NEXT:    add x10, sp, #1032
+; CHECK-NEXT:    add x9, sp, #40
+; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v0.b }[15], [x8]
+; CHECK-NEXT:    ld1 { v1.b }[11], [x9]
+; CHECK-NEXT:    add x9, sp, #1424
+; CHECK-NEXT:    add x8, sp, #520
+; CHECK-NEXT:    ld1 { v7.b }[8], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #1040
+; CHECK-NEXT:    add x9, sp, #48
+; CHECK-NEXT:    ld1 { v17.b }[8], [x8]
+; CHECK-NEXT:    add x10, sp, #528
+; CHECK-NEXT:    ld1 { v1.b }[12], [x9]
+; CHECK-NEXT:    add x9, sp, #1432
+; CHECK-NEXT:    sdot v6.4s, v4.16b, v3.16b
+; CHECK-NEXT:    ld1 { v7.b }[9], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[8], [x10]
 ; CHECK-NEXT:    add x9, sp, #1048
-; CHECK-NEXT:    ld1 { v4.b }[9], [x11]
-; CHECK-NEXT:    add x11, sp, #544
-; CHECK-NEXT:    ld1 { v5.b }[9], [x10]
-; CHECK-NEXT:    add x10, sp, #160
-; CHECK-NEXT:    ld1 { v18.b }[9], [x8]
-; CHECK-NEXT:    add x8, sp, #1440
-; CHECK-NEXT:    ld1 { v19.b }[9], [x9]
-; CHECK-NEXT:    add x9, sp, #1056
-; CHECK-NEXT:    ld1 { v4.b }[10], [x11]
-; CHECK-NEXT:    add x11, sp, #552
-; CHECK-NEXT:    ld1 { v5.b }[10], [x10]
-; CHECK-NEXT:    add x10, sp, #168
-; CHECK-NEXT:    ld1 { v18.b }[10], [x8]
-; CHECK-NEXT:    add x8, sp, #1448
-; CHECK-NEXT:    ld1 { v19.b }[10], [x9]
-; CHECK-NEXT:    add x9, sp, #1064
-; CHECK-NEXT:    ld1 { v4.b }[11], [x11]
-; CHECK-NEXT:    add x11, sp, #560
-; CHECK-NEXT:    ld1 { v5.b }[11], [x10]
-; CHECK-NEXT:    add x10, sp, #176
-; CHECK-NEXT:    ld1 { v18.b }[11], [x8]
-; CHECK-NEXT:    add x8, sp, #1456
-; CHECK-NEXT:    ld1 { v19.b }[11], [x9]
-; CHECK-NEXT:    add x9, sp, #1072
-; CHECK-NEXT:    ld1 { v4.b }[12], [x11]
-; CHECK-NEXT:    add x11, sp, #568
-; CHECK-NEXT:    ld1 { v5.b }[12], [x10]
-; CHECK-NEXT:    add x10, sp, #184
-; CHECK-NEXT:    ld1 { v18.b }[12], [x8]
-; CHECK-NEXT:    add x8, sp, #1464
-; CHECK-NEXT:    ld1 { v19.b }[12], [x9]
-; CHECK-NEXT:    add x9, sp, #1080
-; CHECK-NEXT:    ld1 { v4.b }[13], [x11]
-; CHECK-NEXT:    add x11, sp, #1128
-; CHECK-NEXT:    ld1 { v5.b }[13], [x10]
-; CHECK-NEXT:    add x10, sp, #1112
-; CHECK-NEXT:    ld1 { v18.b }[13], [x8]
-; CHECK-NEXT:    add x8, sp, #1472
-; CHECK-NEXT:    ld1 { v19.b }[13], [x9]
-; CHECK-NEXT:    add x9, sp, #1088
-; CHECK-NEXT:    ld1 { v16.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #728
-; CHECK-NEXT:    sdot v7.4s, v3.16b, v2.16b
-; CHECK-NEXT:    ldr b2, [sp, #1232]
-; CHECK-NEXT:    ld1 { v18.b }[14], [x8]
-; CHECK-NEXT:    add x8, sp, #1480
-; CHECK-NEXT:    ld1 { v19.b }[14], [x9]
-; CHECK-NEXT:    add x9, sp, #1096
-; CHECK-NEXT:    ld1 { v17.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #856
-; CHECK-NEXT:    ldr b3, [sp, #848]
-; CHECK-NEXT:    ld1 { v18.b }[15], [x8]
-; CHECK-NEXT:    add x8, sp, #576
-; CHECK-NEXT:    ld1 { v19.b }[15], [x9]
-; CHECK-NEXT:    add x9, sp, #192
+; CHECK-NEXT:    ldr b3, [sp, #80]
+; CHECK-NEXT:    ld1 { v17.b }[9], [x9]
+; CHECK-NEXT:    add x8, sp, #56
+; CHECK-NEXT:    add x10, sp, #88
+; CHECK-NEXT:    add x9, sp, #536
+; CHECK-NEXT:    add x11, sp, #1440
 ; CHECK-NEXT:    ld1 { v3.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #864
-; CHECK-NEXT:    ld1 { v4.b }[14], [x8]
-; CHECK-NEXT:    add x8, sp, #1120
-; CHECK-NEXT:    ld1 { v5.b }[14], [x9]
-; CHECK-NEXT:    add x9, sp, #1240
-; CHECK-NEXT:    sdot v6.4s, v19.16b, v18.16b
-; CHECK-NEXT:    ld1 { v16.b }[2], [x8]
+; CHECK-NEXT:    ld1 { v1.b }[13], [x8]
+; CHECK-NEXT:    ld1 { v2.b }[9], [x9]
+; CHECK-NEXT:    add x8, sp, #1056
+; CHECK-NEXT:    ld1 { v7.b }[10], [x11]
+; CHECK-NEXT:    add x9, sp, #96
+; CHECK-NEXT:    ld1 { v17.b }[10], [x8]
+; CHECK-NEXT:    add x8, sp, #544
+; CHECK-NEXT:    add x10, sp, #1448
+; CHECK-NEXT:    ld1 { v3.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[10], [x8]
+; CHECK-NEXT:    add x8, sp, #1064
+; CHECK-NEXT:    ld1 { v7.b }[11], [x10]
+; CHECK-NEXT:    add x10, sp, #104
+; CHECK-NEXT:    add x11, sp, #1456
+; CHECK-NEXT:    ld1 { v17.b }[11], [x8]
+; CHECK-NEXT:    add x8, sp, #552
+; CHECK-NEXT:    add x9, sp, #64
+; CHECK-NEXT:    ld1 { v3.b }[3], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[11], [x8]
+; CHECK-NEXT:    add x8, sp, #1072
+; CHECK-NEXT:    ld1 { v7.b }[12], [x11]
+; CHECK-NEXT:    ld1 { v1.b }[14], [x9]
+; CHECK-NEXT:    add x9, sp, #112
+; CHECK-NEXT:    ld1 { v17.b }[12], [x8]
+; CHECK-NEXT:    add x8, sp, #560
+; CHECK-NEXT:    add x10, sp, #1464
+; CHECK-NEXT:    ld1 { v3.b }[4], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[12], [x8]
+; CHECK-NEXT:    add x8, sp, #1080
+; CHECK-NEXT:    ld1 { v7.b }[13], [x10]
+; CHECK-NEXT:    add x10, sp, #120
+; CHECK-NEXT:    add x11, sp, #1472
+; CHECK-NEXT:    ld1 { v17.b }[13], [x8]
+; CHECK-NEXT:    add x8, sp, #568
+; CHECK-NEXT:    add x9, sp, #72
+; CHECK-NEXT:    ld1 { v3.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[13], [x8]
+; CHECK-NEXT:    add x8, sp, #1088
+; CHECK-NEXT:    ld1 { v7.b }[14], [x11]
+; CHECK-NEXT:    ld1 { v1.b }[15], [x9]
+; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    ld1 { v17.b }[14], [x8]
+; CHECK-NEXT:    ldr b4, [sp, #1104]
+; CHECK-NEXT:    add x10, sp, #1480
+; CHECK-NEXT:    ld1 { v3.b }[6], [x9]
+; CHECK-NEXT:    add x8, sp, #1096
+; CHECK-NEXT:    add x9, sp, #1112
+; CHECK-NEXT:    ld1 { v7.b }[15], [x10]
+; CHECK-NEXT:    ld1 { v4.b }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #576
+; CHECK-NEXT:    ld1 { v17.b }[15], [x8]
+; CHECK-NEXT:    add x8, sp, #728
+; CHECK-NEXT:    add x10, sp, #136
+; CHECK-NEXT:    ld1 { v16.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #1120
+; CHECK-NEXT:    ld1 { v2.b }[14], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[2], [x8]
 ; CHECK-NEXT:    add x8, sp, #736
-; CHECK-NEXT:    ld1 { v2.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #1248
-; CHECK-NEXT:    ld1 { v3.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #872
-; CHECK-NEXT:    ld1 { v17.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #744
+; CHECK-NEXT:    ld1 { v3.b }[7], [x10]
+; CHECK-NEXT:    sdot v5.4s, v17.16b, v7.16b
+; CHECK-NEXT:    ldr b7, [sp, #1232]
+; CHECK-NEXT:    ldr b17, [sp, #848]
+; CHECK-NEXT:    ld1 { v16.b }[2], [x8]
+; CHECK-NEXT:    add x9, sp, #1240
+; CHECK-NEXT:    add x10, sp, #856
+; CHECK-NEXT:    ld1 { v7.b }[1], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[1], [x10]
+; CHECK-NEXT:    add x8, sp, #1128
+; CHECK-NEXT:    add x11, sp, #744
+; CHECK-NEXT:    ld1 { v4.b }[3], [x8]
+; CHECK-NEXT:    add x10, sp, #1248
 ; CHECK-NEXT:    ld1 { v16.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #1136
-; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
+; CHECK-NEXT:    add x11, sp, #864
+; CHECK-NEXT:    add x9, sp, #144
+; CHECK-NEXT:    ld1 { v7.b }[2], [x10]
+; CHECK-NEXT:    ld1 { v17.b }[2], [x11]
+; CHECK-NEXT:    add x8, sp, #1136
+; CHECK-NEXT:    add x12, sp, #752
+; CHECK-NEXT:    ld1 { v3.b }[8], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[4], [x12]
 ; CHECK-NEXT:    add x9, sp, #1256
-; CHECK-NEXT:    ld1 { v3.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #880
-; CHECK-NEXT:    ld1 { v17.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #752
-; CHECK-NEXT:    ld1 { v16.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #1144
-; CHECK-NEXT:    ld1 { v2.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #1264
-; CHECK-NEXT:    ld1 { v3.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #888
-; CHECK-NEXT:    ld1 { v17.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #760
+; CHECK-NEXT:    add x10, sp, #872
+; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[3], [x10]
+; CHECK-NEXT:    add x8, sp, #1144
+; CHECK-NEXT:    add x11, sp, #760
+; CHECK-NEXT:    ld1 { v4.b }[5], [x8]
+; CHECK-NEXT:    add x10, sp, #1264
 ; CHECK-NEXT:    ld1 { v16.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #1152
-; CHECK-NEXT:    ld1 { v2.b }[4], [x9]
+; CHECK-NEXT:    add x11, sp, #880
+; CHECK-NEXT:    add x9, sp, #152
+; CHECK-NEXT:    ld1 { v7.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v17.b }[4], [x11]
+; CHECK-NEXT:    add x8, sp, #1152
+; CHECK-NEXT:    add x12, sp, #768
+; CHECK-NEXT:    ld1 { v3.b }[9], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[6], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[6], [x12]
 ; CHECK-NEXT:    add x9, sp, #1272
-; CHECK-NEXT:    ld1 { v3.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #896
-; CHECK-NEXT:    ld1 { v17.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #768
-; CHECK-NEXT:    ld1 { v16.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #1160
-; CHECK-NEXT:    ld1 { v2.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #1280
-; CHECK-NEXT:    ld1 { v3.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #904
-; CHECK-NEXT:    ld1 { v17.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #776
+; CHECK-NEXT:    add x10, sp, #888
+; CHECK-NEXT:    ld1 { v7.b }[5], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[5], [x10]
+; CHECK-NEXT:    add x8, sp, #1160
+; CHECK-NEXT:    add x11, sp, #776
+; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
+; CHECK-NEXT:    add x10, sp, #1280
 ; CHECK-NEXT:    ld1 { v16.b }[7], [x11]
-; CHECK-NEXT:    add x11, sp, #1168
-; CHECK-NEXT:    ld1 { v2.b }[6], [x9]
+; CHECK-NEXT:    add x11, sp, #896
+; CHECK-NEXT:    add x9, sp, #160
+; CHECK-NEXT:    ld1 { v7.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v17.b }[6], [x11]
+; CHECK-NEXT:    add x8, sp, #1168
+; CHECK-NEXT:    add x12, sp, #784
+; CHECK-NEXT:    ld1 { v3.b }[10], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[8], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[8], [x12]
 ; CHECK-NEXT:    add x9, sp, #1288
-; CHECK-NEXT:    ld1 { v3.b }[7], [x10]
-; CHECK-NEXT:    add x10, sp, #912
-; CHECK-NEXT:    ld1 { v17.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #784
-; CHECK-NEXT:    ld1 { v16.b }[8], [x11]
-; CHECK-NEXT:    add x11, sp, #1176
-; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #1296
-; CHECK-NEXT:    ld1 { v3.b }[8], [x10]
-; CHECK-NEXT:    add x10, sp, #920
-; CHECK-NEXT:    ld1 { v17.b }[8], [x8]
-; CHECK-NEXT:    add x8, sp, #792
+; CHECK-NEXT:    add x10, sp, #904
+; CHECK-NEXT:    ld1 { v7.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
+; CHECK-NEXT:    add x8, sp, #1176
+; CHECK-NEXT:    add x11, sp, #792
+; CHECK-NEXT:    ld1 { v4.b }[9], [x8]
+; CHECK-NEXT:    add x10, sp, #1296
 ; CHECK-NEXT:    ld1 { v16.b }[9], [x11]
-; CHECK-NEXT:    add x11, sp, #1184
-; CHECK-NEXT:    ld1 { v2.b }[8], [x9]
+; CHECK-NEXT:    add x11, sp, #912
+; CHECK-NEXT:    add x9, sp, #168
+; CHECK-NEXT:    ld1 { v7.b }[8], [x10]
+; CHECK-NEXT:    ld1 { v17.b }[8], [x11]
+; CHECK-NEXT:    add x8, sp, #1184
+; CHECK-NEXT:    add x12, sp, #800
+; CHECK-NEXT:    ld1 { v3.b }[11], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[10], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[10], [x12]
 ; CHECK-NEXT:    add x9, sp, #1304
-; CHECK-NEXT:    ld1 { v3.b }[9], [x10]
-; CHECK-NEXT:    add x10, sp, #928
-; CHECK-NEXT:    ld1 { v17.b }[9], [x8]
-; CHECK-NEXT:    add x8, sp, #800
-; CHECK-NEXT:    ld1 { v16.b }[10], [x11]
-; CHECK-NEXT:    add x11, sp, #1192
-; CHECK-NEXT:    ld1 { v2.b }[9], [x9]
-; CHECK-NEXT:    add x9, sp, #1312
-; CHECK-NEXT:    ld1 { v3.b }[10], [x10]
-; CHECK-NEXT:    add x10, sp, #936
-; CHECK-NEXT:    ld1 { v17.b }[10], [x8]
-; CHECK-NEXT:    add x8, sp, #808
+; CHECK-NEXT:    add x10, sp, #920
+; CHECK-NEXT:    ld1 { v7.b }[9], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[9], [x10]
+; CHECK-NEXT:    add x8, sp, #1192
+; CHECK-NEXT:    add x11, sp, #808
+; CHECK-NEXT:    ld1 { v4.b }[11], [x8]
+; CHECK-NEXT:    add x10, sp, #1312
 ; CHECK-NEXT:    ld1 { v16.b }[11], [x11]
-; CHECK-NEXT:    add x11, sp, #1200
-; CHECK-NEXT:    ld1 { v2.b }[10], [x9]
+; CHECK-NEXT:    add x11, sp, #928
+; CHECK-NEXT:    add x9, sp, #176
+; CHECK-NEXT:    ld1 { v7.b }[10], [x10]
+; CHECK-NEXT:    ld1 { v17.b }[10], [x11]
+; CHECK-NEXT:    add x8, sp, #1200
+; CHECK-NEXT:    add x12, sp, #816
+; CHECK-NEXT:    ld1 { v3.b }[12], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[12], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[12], [x12]
 ; CHECK-NEXT:    add x9, sp, #1320
-; CHECK-NEXT:    ld1 { v3.b }[11], [x10]
-; CHECK-NEXT:    add x10, sp, #944
-; CHECK-NEXT:    ld1 { v17.b }[11], [x8]
-; CHECK-NEXT:    add x8, sp, #816
-; CHECK-NEXT:    ld1 { v16.b }[12], [x11]
-; CHECK-NEXT:    add x11, sp, #1208
-; CHECK-NEXT:    ld1 { v2.b }[11], [x9]
-; CHECK-NEXT:    add x9, sp, #1328
-; CHECK-NEXT:    ld1 { v3.b }[12], [x10]
-; CHECK-NEXT:    add x10, sp, #952
-; CHECK-NEXT:    ld1 { v17.b }[12], [x8]
-; CHECK-NEXT:    add x8, sp, #824
+; CHECK-NEXT:    add x10, sp, #936
+; CHECK-NEXT:    ld1 { v7.b }[11], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[11], [x10]
+; CHECK-NEXT:    add x8, sp, #1208
+; CHECK-NEXT:    add x11, sp, #824
+; CHECK-NEXT:    ld1 { v4.b }[13], [x8]
+; CHECK-NEXT:    add x10, sp, #1328
 ; CHECK-NEXT:    ld1 { v16.b }[13], [x11]
-; CHECK-NEXT:    add x11, sp, #1216
-; CHECK-NEXT:    ld1 { v2.b }[12], [x9]
+; CHECK-NEXT:    add x11, sp, #944
+; CHECK-NEXT:    add x9, sp, #184
+; CHECK-NEXT:    ld1 { v7.b }[12], [x10]
+; CHECK-NEXT:    ld1 { v17.b }[12], [x11]
+; CHECK-NEXT:    add x8, sp, #1216
+; CHECK-NEXT:    add x12, sp, #832
+; CHECK-NEXT:    ld1 { v3.b }[13], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[14], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[14], [x12]
 ; CHECK-NEXT:    add x9, sp, #1336
-; CHECK-NEXT:    ld1 { v3.b }[13], [x10]
-; CHECK-NEXT:    add x10, sp, #960
-; CHECK-NEXT:    ld1 { v17.b }[13], [x8]
-; CHECK-NEXT:    add x8, sp, #832
-; CHECK-NEXT:    ld1 { v16.b }[14], [x11]
-; CHECK-NEXT:    add x11, sp, #1224
-; CHECK-NEXT:    ld1 { v2.b }[13], [x9]
-; CHECK-NEXT:    add x9, sp, #1344
-; CHECK-NEXT:    ld1 { v3.b }[14], [x10]
-; CHECK-NEXT:    add x10, sp, #968
-; CHECK-NEXT:    ld1 { v17.b }[14], [x8]
-; CHECK-NEXT:    add x8, sp, #840
+; CHECK-NEXT:    add x10, sp, #952
+; CHECK-NEXT:    ld1 { v7.b }[13], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[13], [x10]
+; CHECK-NEXT:    add x8, sp, #1224
+; CHECK-NEXT:    add x11, sp, #840
+; CHECK-NEXT:    ld1 { v4.b }[15], [x8]
+; CHECK-NEXT:    add x8, sp, #192
 ; CHECK-NEXT:    ld1 { v16.b }[15], [x11]
-; CHECK-NEXT:    add x11, sp, #584
-; CHECK-NEXT:    ld1 { v2.b }[14], [x9]
-; CHECK-NEXT:    add x9, sp, #1352
-; CHECK-NEXT:    sdot v7.4s, v1.16b, v0.16b
-; CHECK-NEXT:    ld1 { v3.b }[15], [x10]
-; CHECK-NEXT:    ld1 { v17.b }[15], [x8]
+; CHECK-NEXT:    add x10, sp, #1344
+; CHECK-NEXT:    add x11, sp, #960
+; CHECK-NEXT:    ld1 { v3.b }[14], [x8]
+; CHECK-NEXT:    ld1 { v7.b }[14], [x10]
+; CHECK-NEXT:    ld1 { v17.b }[14], [x11]
+; CHECK-NEXT:    add x9, sp, #584
+; CHECK-NEXT:    sdot v6.4s, v1.16b, v0.16b
 ; CHECK-NEXT:    add x8, sp, #200
-; CHECK-NEXT:    ld1 { v4.b }[15], [x11]
+; CHECK-NEXT:    sdot v5.4s, v16.16b, v4.16b
 ; CHECK-NEXT:    ld1 { v2.b }[15], [x9]
-; CHECK-NEXT:    ld1 { v5.b }[15], [x8]
-; CHECK-NEXT:    sdot v6.4s, v17.16b, v16.16b
-; CHECK-NEXT:    sdot v7.4s, v5.16b, v4.16b
+; CHECK-NEXT:    add x9, sp, #1352
+; CHECK-NEXT:    add x10, sp, #968
+; CHECK-NEXT:    ld1 { v3.b }[15], [x8]
+; CHECK-NEXT:    ld1 { v7.b }[15], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[15], [x10]
 ; CHECK-NEXT:    sdot v6.4s, v3.16b, v2.16b
-; CHECK-NEXT:    add v0.4s, v7.4s, v6.4s
+; CHECK-NEXT:    sdot v5.4s, v17.16b, v7.16b
+; CHECK-NEXT:    add v0.4s, v6.4s, v5.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2658,195 +2662,195 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldr b0, [sp, #208]
+; CHECK-NEXT:    ldr b1, [sp, #208]
 ; CHECK-NEXT:    add x8, sp, #216
-; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    add x9, sp, #232
-; CHECK-NEXT:    ldr b2, [sp, #80]
-; CHECK-NEXT:    add x11, sp, #88
-; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ldr b5, [sp, #976]
+; CHECK-NEXT:    add x9, sp, #984
+; CHECK-NEXT:    add x12, sp, #328
+; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
 ; CHECK-NEXT:    add x8, sp, #224
-; CHECK-NEXT:    mov v1.b[1], w1
-; CHECK-NEXT:    add x10, sp, #248
-; CHECK-NEXT:    ld1 { v2.b }[1], [x11]
-; CHECK-NEXT:    add x11, sp, #728
-; CHECK-NEXT:    ldr b4, [sp, #720]
-; CHECK-NEXT:    add x12, sp, #984
-; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #240
-; CHECK-NEXT:    mov v1.b[2], w2
-; CHECK-NEXT:    ldr b3, [sp, #976]
-; CHECK-NEXT:    ldr b5, [sp, #848]
-; CHECK-NEXT:    add x13, sp, #96
-; CHECK-NEXT:    ld1 { v4.b }[1], [x11]
-; CHECK-NEXT:    add x11, sp, #856
-; CHECK-NEXT:    ld1 { v0.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #256
-; CHECK-NEXT:    mov v1.b[3], w3
-; CHECK-NEXT:    ld1 { v3.b }[1], [x12]
-; CHECK-NEXT:    add x12, sp, #264
-; CHECK-NEXT:    ld1 { v5.b }[1], [x11]
+; CHECK-NEXT:    movi v2.16b, #1
+; CHECK-NEXT:    mov v0.b[1], w1
+; CHECK-NEXT:    ld1 { v5.b }[1], [x9]
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
 ; CHECK-NEXT:    add x11, sp, #992
-; CHECK-NEXT:    ld1 { v2.b }[2], [x13]
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    add x13, sp, #736
-; CHECK-NEXT:    mov v1.b[4], w4
+; CHECK-NEXT:    ldr b6, [sp, #720]
+; CHECK-NEXT:    ldr b7, [sp, #80]
+; CHECK-NEXT:    ld1 { v1.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #232
+; CHECK-NEXT:    add x13, sp, #88
+; CHECK-NEXT:    ld1 { v5.b }[2], [x11]
+; CHECK-NEXT:    ld1 { v7.b }[1], [x13]
+; CHECK-NEXT:    add x13, sp, #856
+; CHECK-NEXT:    mov v0.b[2], w2
+; CHECK-NEXT:    add x14, sp, #744
+; CHECK-NEXT:    add x15, sp, #872
+; CHECK-NEXT:    ld1 { v1.b }[3], [x8]
+; CHECK-NEXT:    add x8, sp, #240
+; CHECK-NEXT:    add x16, sp, #888
+; CHECK-NEXT:    add x10, sp, #16
+; CHECK-NEXT:    add x9, sp, #24
+; CHECK-NEXT:    add x11, sp, #40
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v1.b }[4], [x8]
+; CHECK-NEXT:    add x8, sp, #248
+; CHECK-NEXT:    mov v0.b[3], w3
+; CHECK-NEXT:    ld1 { v1.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #256
+; CHECK-NEXT:    mov v0.b[4], w4
+; CHECK-NEXT:    ld1 { v1.b }[6], [x8]
+; CHECK-NEXT:    add x8, sp, #264
+; CHECK-NEXT:    mov v0.b[5], w5
+; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
 ; CHECK-NEXT:    add x8, sp, #272
-; CHECK-NEXT:    ld1 { v3.b }[2], [x11]
+; CHECK-NEXT:    ld1 { v1.b }[8], [x8]
+; CHECK-NEXT:    add x8, sp, #280
+; CHECK-NEXT:    mov v0.b[6], w6
+; CHECK-NEXT:    ld1 { v1.b }[9], [x8]
+; CHECK-NEXT:    add x8, sp, #288
+; CHECK-NEXT:    mov v0.b[7], w7
+; CHECK-NEXT:    ld1 { v1.b }[10], [x8]
+; CHECK-NEXT:    add x8, sp, #296
+; CHECK-NEXT:    ld1 { v0.b }[8], [x10]
+; CHECK-NEXT:    add x10, sp, #128
+; CHECK-NEXT:    ld1 { v1.b }[11], [x8]
+; CHECK-NEXT:    add x8, sp, #304
+; CHECK-NEXT:    ld1 { v0.b }[9], [x9]
+; CHECK-NEXT:    add x9, sp, #136
+; CHECK-NEXT:    ld1 { v1.b }[12], [x8]
+; CHECK-NEXT:    add x8, sp, #312
+; CHECK-NEXT:    ld1 { v1.b }[13], [x8]
+; CHECK-NEXT:    add x8, sp, #320
+; CHECK-NEXT:    ld1 { v1.b }[14], [x8]
+; CHECK-NEXT:    add x8, sp, #32
+; CHECK-NEXT:    ld1 { v0.b }[10], [x8]
+; CHECK-NEXT:    add x8, sp, #144
+; CHECK-NEXT:    ld1 { v1.b }[15], [x12]
+; CHECK-NEXT:    add x12, sp, #728
+; CHECK-NEXT:    ld1 { v6.b }[1], [x12]
+; CHECK-NEXT:    add x12, sp, #1000
+; CHECK-NEXT:    ld1 { v0.b }[11], [x11]
+; CHECK-NEXT:    ld1 { v5.b }[3], [x12]
+; CHECK-NEXT:    add x12, sp, #736
+; CHECK-NEXT:    add x11, sp, #920
+; CHECK-NEXT:    sdot v4.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ldr b1, [sp, #848]
+; CHECK-NEXT:    ld1 { v6.b }[2], [x12]
+; CHECK-NEXT:    add x12, sp, #1008
+; CHECK-NEXT:    ld1 { v1.b }[1], [x13]
+; CHECK-NEXT:    ld1 { v5.b }[4], [x12]
+; CHECK-NEXT:    add x12, sp, #96
+; CHECK-NEXT:    ld1 { v7.b }[2], [x12]
+; CHECK-NEXT:    add x12, sp, #1016
+; CHECK-NEXT:    add x13, sp, #48
+; CHECK-NEXT:    ld1 { v6.b }[3], [x14]
 ; CHECK-NEXT:    add x14, sp, #864
-; CHECK-NEXT:    ld1 { v4.b }[2], [x13]
-; CHECK-NEXT:    add x13, sp, #1000
-; CHECK-NEXT:    ld1 { v0.b }[5], [x10]
-; CHECK-NEXT:    add x11, sp, #104
-; CHECK-NEXT:    mov v1.b[5], w5
-; CHECK-NEXT:    add x10, sp, #280
-; CHECK-NEXT:    ld1 { v3.b }[3], [x13]
-; CHECK-NEXT:    add x13, sp, #16
-; CHECK-NEXT:    ld1 { v5.b }[2], [x14]
-; CHECK-NEXT:    add x14, sp, #296
-; CHECK-NEXT:    ld1 { v0.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #288
-; CHECK-NEXT:    mov v1.b[6], w6
-; CHECK-NEXT:    ld1 { v2.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #872
-; CHECK-NEXT:    movi v6.16b, #1
-; CHECK-NEXT:    ld1 { v0.b }[7], [x12]
-; CHECK-NEXT:    add x12, sp, #744
-; CHECK-NEXT:    mov v1.b[7], w7
-; CHECK-NEXT:    ld1 { v5.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #1008
-; CHECK-NEXT:    ld1 { v4.b }[3], [x12]
+; CHECK-NEXT:    ld1 { v0.b }[12], [x13]
+; CHECK-NEXT:    ld1 { v1.b }[2], [x14]
+; CHECK-NEXT:    add x14, sp, #752
+; CHECK-NEXT:    ld1 { v5.b }[5], [x12]
+; CHECK-NEXT:    add x12, sp, #104
+; CHECK-NEXT:    ld1 { v6.b }[4], [x14]
+; CHECK-NEXT:    add x14, sp, #1024
+; CHECK-NEXT:    ld1 { v7.b }[3], [x12]
+; CHECK-NEXT:    ld1 { v1.b }[3], [x15]
+; CHECK-NEXT:    add x15, sp, #760
+; CHECK-NEXT:    ld1 { v5.b }[6], [x14]
 ; CHECK-NEXT:    add x12, sp, #112
-; CHECK-NEXT:    ld1 { v0.b }[8], [x8]
-; CHECK-NEXT:    add x8, sp, #304
-; CHECK-NEXT:    ld1 { v1.b }[8], [x13]
-; CHECK-NEXT:    add x13, sp, #24
-; CHECK-NEXT:    ld1 { v3.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #880
-; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
-; CHECK-NEXT:    add x12, sp, #752
-; CHECK-NEXT:    ld1 { v0.b }[9], [x10]
-; CHECK-NEXT:    add x10, sp, #312
-; CHECK-NEXT:    ld1 { v1.b }[9], [x13]
-; CHECK-NEXT:    add x13, sp, #32
-; CHECK-NEXT:    ld1 { v5.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #1016
-; CHECK-NEXT:    ld1 { v4.b }[4], [x12]
+; CHECK-NEXT:    add x14, sp, #880
+; CHECK-NEXT:    ld1 { v6.b }[5], [x15]
+; CHECK-NEXT:    add x15, sp, #1032
+; CHECK-NEXT:    ld1 { v7.b }[4], [x12]
+; CHECK-NEXT:    ld1 { v1.b }[4], [x14]
+; CHECK-NEXT:    add x14, sp, #768
+; CHECK-NEXT:    ld1 { v5.b }[7], [x15]
 ; CHECK-NEXT:    add x12, sp, #120
-; CHECK-NEXT:    ld1 { v0.b }[10], [x9]
-; CHECK-NEXT:    add x9, sp, #320
-; CHECK-NEXT:    ld1 { v1.b }[10], [x13]
-; CHECK-NEXT:    add x13, sp, #40
-; CHECK-NEXT:    ld1 { v3.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #888
-; CHECK-NEXT:    ld1 { v2.b }[5], [x12]
-; CHECK-NEXT:    add x12, sp, #760
-; CHECK-NEXT:    ld1 { v0.b }[11], [x14]
-; CHECK-NEXT:    add x14, sp, #328
-; CHECK-NEXT:    ld1 { v1.b }[11], [x13]
-; CHECK-NEXT:    add x13, sp, #48
-; CHECK-NEXT:    ld1 { v4.b }[5], [x12]
-; CHECK-NEXT:    add x12, sp, #56
-; CHECK-NEXT:    ld1 { v5.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #128
-; CHECK-NEXT:    ld1 { v0.b }[12], [x8]
-; CHECK-NEXT:    add x8, sp, #1024
-; CHECK-NEXT:    ld1 { v1.b }[12], [x13]
-; CHECK-NEXT:    ld1 { v2.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #64
-; CHECK-NEXT:    ld1 { v3.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #1032
-; CHECK-NEXT:    ld1 { v0.b }[13], [x10]
-; CHECK-NEXT:    add x10, sp, #768
-; CHECK-NEXT:    ld1 { v1.b }[13], [x12]
-; CHECK-NEXT:    movi v7.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v3.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #1040
-; CHECK-NEXT:    ld1 { v4.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #896
-; CHECK-NEXT:    ld1 { v0.b }[14], [x9]
-; CHECK-NEXT:    add x9, sp, #776
-; CHECK-NEXT:    ld1 { v1.b }[14], [x11]
-; CHECK-NEXT:    add x11, sp, #136
-; CHECK-NEXT:    ld1 { v3.b }[8], [x8]
-; CHECK-NEXT:    add x8, sp, #1048
-; CHECK-NEXT:    ld1 { v4.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #784
-; CHECK-NEXT:    ld1 { v5.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #904
-; CHECK-NEXT:    ld1 { v2.b }[7], [x11]
-; CHECK-NEXT:    add x11, sp, #144
-; CHECK-NEXT:    ld1 { v3.b }[9], [x8]
-; CHECK-NEXT:    add x8, sp, #1056
-; CHECK-NEXT:    ld1 { v4.b }[8], [x9]
+; CHECK-NEXT:    add x15, sp, #1040
+; CHECK-NEXT:    ld1 { v6.b }[6], [x14]
+; CHECK-NEXT:    ld1 { v7.b }[5], [x12]
+; CHECK-NEXT:    add x12, sp, #776
+; CHECK-NEXT:    ld1 { v1.b }[5], [x16]
+; CHECK-NEXT:    ld1 { v5.b }[8], [x15]
+; CHECK-NEXT:    add x15, sp, #896
+; CHECK-NEXT:    add x14, sp, #1048
+; CHECK-NEXT:    ld1 { v6.b }[7], [x12]
+; CHECK-NEXT:    ld1 { v7.b }[6], [x10]
+; CHECK-NEXT:    add x10, sp, #784
+; CHECK-NEXT:    ld1 { v1.b }[6], [x15]
+; CHECK-NEXT:    ld1 { v5.b }[9], [x14]
+; CHECK-NEXT:    add x14, sp, #904
+; CHECK-NEXT:    add x12, sp, #1056
+; CHECK-NEXT:    ld1 { v6.b }[8], [x10]
+; CHECK-NEXT:    ld1 { v7.b }[7], [x9]
 ; CHECK-NEXT:    add x9, sp, #792
-; CHECK-NEXT:    ld1 { v5.b }[7], [x10]
-; CHECK-NEXT:    add x10, sp, #912
-; CHECK-NEXT:    ld1 { v2.b }[8], [x11]
-; CHECK-NEXT:    add x11, sp, #152
-; CHECK-NEXT:    ld1 { v3.b }[10], [x8]
-; CHECK-NEXT:    add x8, sp, #1064
-; CHECK-NEXT:    ld1 { v4.b }[9], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[7], [x14]
+; CHECK-NEXT:    ld1 { v5.b }[10], [x12]
+; CHECK-NEXT:    add x12, sp, #912
+; CHECK-NEXT:    add x10, sp, #1064
+; CHECK-NEXT:    ld1 { v6.b }[9], [x9]
+; CHECK-NEXT:    ld1 { v7.b }[8], [x8]
 ; CHECK-NEXT:    add x9, sp, #800
-; CHECK-NEXT:    ld1 { v5.b }[8], [x10]
-; CHECK-NEXT:    add x10, sp, #920
-; CHECK-NEXT:    ld1 { v2.b }[9], [x11]
-; CHECK-NEXT:    add x11, sp, #160
-; CHECK-NEXT:    ld1 { v3.b }[11], [x8]
-; CHECK-NEXT:    add x8, sp, #1072
-; CHECK-NEXT:    ld1 { v4.b }[10], [x9]
-; CHECK-NEXT:    add x9, sp, #808
-; CHECK-NEXT:    ld1 { v5.b }[9], [x10]
-; CHECK-NEXT:    add x10, sp, #928
-; CHECK-NEXT:    ld1 { v2.b }[10], [x11]
-; CHECK-NEXT:    add x11, sp, #168
-; CHECK-NEXT:    ld1 { v3.b }[12], [x8]
-; CHECK-NEXT:    add x8, sp, #1080
-; CHECK-NEXT:    ld1 { v4.b }[11], [x9]
-; CHECK-NEXT:    add x9, sp, #816
-; CHECK-NEXT:    ld1 { v5.b }[10], [x10]
-; CHECK-NEXT:    add x10, sp, #936
-; CHECK-NEXT:    ld1 { v2.b }[11], [x11]
-; CHECK-NEXT:    add x11, sp, #176
-; CHECK-NEXT:    ld1 { v3.b }[13], [x8]
-; CHECK-NEXT:    add x8, sp, #1088
-; CHECK-NEXT:    ld1 { v4.b }[12], [x9]
-; CHECK-NEXT:    add x9, sp, #824
+; CHECK-NEXT:    ld1 { v1.b }[8], [x12]
 ; CHECK-NEXT:    ld1 { v5.b }[11], [x10]
-; CHECK-NEXT:    add x10, sp, #944
-; CHECK-NEXT:    ld1 { v2.b }[12], [x11]
-; CHECK-NEXT:    add x11, sp, #184
-; CHECK-NEXT:    ld1 { v3.b }[14], [x8]
-; CHECK-NEXT:    add x8, sp, #1096
-; CHECK-NEXT:    ld1 { v4.b }[13], [x9]
-; CHECK-NEXT:    add x9, sp, #832
+; CHECK-NEXT:    add x8, sp, #152
+; CHECK-NEXT:    add x10, sp, #1072
+; CHECK-NEXT:    ld1 { v6.b }[10], [x9]
+; CHECK-NEXT:    ld1 { v7.b }[9], [x8]
+; CHECK-NEXT:    add x9, sp, #808
+; CHECK-NEXT:    ld1 { v1.b }[9], [x11]
 ; CHECK-NEXT:    ld1 { v5.b }[12], [x10]
-; CHECK-NEXT:    add x10, sp, #952
-; CHECK-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v0.b }[15], [x14]
-; CHECK-NEXT:    ld1 { v2.b }[13], [x11]
-; CHECK-NEXT:    add x11, sp, #72
-; CHECK-NEXT:    ld1 { v3.b }[15], [x8]
-; CHECK-NEXT:    add x8, sp, #192
-; CHECK-NEXT:    ld1 { v4.b }[14], [x9]
-; CHECK-NEXT:    add x9, sp, #840
+; CHECK-NEXT:    add x10, sp, #160
+; CHECK-NEXT:    add x8, sp, #56
+; CHECK-NEXT:    ld1 { v6.b }[11], [x9]
+; CHECK-NEXT:    add x9, sp, #928
+; CHECK-NEXT:    ld1 { v7.b }[10], [x10]
+; CHECK-NEXT:    add x10, sp, #1080
+; CHECK-NEXT:    ld1 { v1.b }[10], [x9]
+; CHECK-NEXT:    ld1 { v0.b }[13], [x8]
 ; CHECK-NEXT:    ld1 { v5.b }[13], [x10]
-; CHECK-NEXT:    add x10, sp, #960
-; CHECK-NEXT:    ld1 { v1.b }[15], [x11]
-; CHECK-NEXT:    sdot v16.4s, v0.16b, v6.16b
-; CHECK-NEXT:    ld1 { v2.b }[14], [x8]
-; CHECK-NEXT:    sdot v7.4s, v3.16b, v6.16b
-; CHECK-NEXT:    ld1 { v4.b }[15], [x9]
-; CHECK-NEXT:    ld1 { v5.b }[14], [x10]
+; CHECK-NEXT:    add x8, sp, #816
+; CHECK-NEXT:    add x9, sp, #168
+; CHECK-NEXT:    ld1 { v6.b }[12], [x8]
+; CHECK-NEXT:    add x8, sp, #936
+; CHECK-NEXT:    ld1 { v7.b }[11], [x9]
+; CHECK-NEXT:    add x9, sp, #1088
+; CHECK-NEXT:    ld1 { v1.b }[11], [x8]
+; CHECK-NEXT:    add x10, sp, #176
+; CHECK-NEXT:    ld1 { v5.b }[14], [x9]
+; CHECK-NEXT:    add x9, sp, #824
+; CHECK-NEXT:    add x8, sp, #64
+; CHECK-NEXT:    ld1 { v6.b }[13], [x9]
+; CHECK-NEXT:    add x9, sp, #944
+; CHECK-NEXT:    ld1 { v7.b }[12], [x10]
+; CHECK-NEXT:    add x10, sp, #1096
+; CHECK-NEXT:    ld1 { v1.b }[12], [x9]
+; CHECK-NEXT:    ld1 { v0.b }[14], [x8]
+; CHECK-NEXT:    ld1 { v5.b }[15], [x10]
+; CHECK-NEXT:    add x8, sp, #832
+; CHECK-NEXT:    add x9, sp, #184
+; CHECK-NEXT:    ld1 { v6.b }[14], [x8]
+; CHECK-NEXT:    add x8, sp, #952
+; CHECK-NEXT:    ld1 { v7.b }[13], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[13], [x8]
+; CHECK-NEXT:    add x10, sp, #72
+; CHECK-NEXT:    add x8, sp, #840
+; CHECK-NEXT:    sdot v3.4s, v5.16b, v2.16b
+; CHECK-NEXT:    ld1 { v0.b }[15], [x10]
+; CHECK-NEXT:    add x9, sp, #192
+; CHECK-NEXT:    ld1 { v6.b }[15], [x8]
+; CHECK-NEXT:    add x8, sp, #960
+; CHECK-NEXT:    ld1 { v7.b }[14], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[14], [x8]
 ; CHECK-NEXT:    add x8, sp, #200
 ; CHECK-NEXT:    add x9, sp, #968
-; CHECK-NEXT:    sdot v16.4s, v1.16b, v6.16b
-; CHECK-NEXT:    ld1 { v2.b }[15], [x8]
-; CHECK-NEXT:    sdot v7.4s, v4.16b, v6.16b
-; CHECK-NEXT:    ld1 { v5.b }[15], [x9]
-; CHECK-NEXT:    sdot v16.4s, v2.16b, v6.16b
-; CHECK-NEXT:    sdot v7.4s, v5.16b, v6.16b
-; CHECK-NEXT:    add v0.4s, v16.4s, v7.4s
+; CHECK-NEXT:    sdot v4.4s, v0.16b, v2.16b
+; CHECK-NEXT:    sdot v3.4s, v6.16b, v2.16b
+; CHECK-NEXT:    ld1 { v7.b }[15], [x8]
+; CHECK-NEXT:    ld1 { v1.b }[15], [x9]
+; CHECK-NEXT:    sdot v4.4s, v7.16b, v2.16b
+; CHECK-NEXT:    sdot v3.4s, v1.16b, v2.16b
+; CHECK-NEXT:    add v0.4s, v4.4s, v3.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2863,17 +2867,17 @@ entry:
 define i32 @test_udot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v64i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q1, q4, [x1, #32]
-; CHECK-NEXT:    movi v5.2d, #0000000000000000
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    ldp q2, q3, [x0, #32]
-; CHECK-NEXT:    udot v5.4s, v1.16b, v2.16b
-; CHECK-NEXT:    ldp q6, q7, [x0]
-; CHECK-NEXT:    udot v0.4s, v4.16b, v3.16b
-; CHECK-NEXT:    ldp q1, q16, [x1]
-; CHECK-NEXT:    udot v5.4s, v1.16b, v6.16b
-; CHECK-NEXT:    udot v0.4s, v16.16b, v7.16b
-; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT:    ldp q4, q5, [x1, #32]
+; CHECK-NEXT:    udot v1.4s, v5.16b, v3.16b
+; CHECK-NEXT:    udot v0.4s, v4.16b, v2.16b
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q4, q5, [x1]
+; CHECK-NEXT:    udot v1.4s, v5.16b, v3.16b
+; CHECK-NEXT:    udot v0.4s, v4.16b, v2.16b
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -2892,16 +2896,16 @@ entry:
 define i32 @test_udot_v64i8_nomla(ptr nocapture readonly %a1) {
 ; CHECK-LABEL: test_udot_v64i8_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q4, q3, [x0, #32]
 ; CHECK-NEXT:    movi v0.16b, #1
 ; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    ldp q3, q4, [x0, #32]
+; CHECK-NEXT:    udot v2.4s, v4.16b, v0.16b
 ; CHECK-NEXT:    udot v1.4s, v3.16b, v0.16b
-; CHECK-NEXT:    ldp q3, q5, [x0]
+; CHECK-NEXT:    ldp q3, q4, [x0]
 ; CHECK-NEXT:    udot v2.4s, v4.16b, v0.16b
-; CHECK-NEXT:    udot v2.4s, v3.16b, v0.16b
-; CHECK-NEXT:    udot v1.4s, v5.16b, v0.16b
-; CHECK-NEXT:    add v0.4s, v2.4s, v1.4s
+; CHECK-NEXT:    udot v1.4s, v3.16b, v0.16b
+; CHECK-NEXT:    add v0.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -2914,17 +2918,17 @@ entry:
 define i32 @test_sdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_sdot_v64i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q1, q4, [x1, #32]
-; CHECK-NEXT:    movi v5.2d, #0000000000000000
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    ldp q2, q3, [x0, #32]
-; CHECK-NEXT:    sdot v5.4s, v1.16b, v2.16b
-; CHECK-NEXT:    ldp q6, q7, [x0]
-; CHECK-NEXT:    sdot v0.4s, v4.16b, v3.16b
-; CHECK-NEXT:    ldp q1, q16, [x1]
-; CHECK-NEXT:    sdot v5.4s, v1.16b, v6.16b
-; CHECK-NEXT:    sdot v0.4s, v16.16b, v7.16b
-; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT:    ldp q4, q5, [x1, #32]
+; CHECK-NEXT:    sdot v1.4s, v5.16b, v3.16b
+; CHECK-NEXT:    sdot v0.4s, v4.16b, v2.16b
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q4, q5, [x1]
+; CHECK-NEXT:    sdot v1.4s, v5.16b, v3.16b
+; CHECK-NEXT:    sdot v0.4s, v4.16b, v2.16b
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    add w0, w8, w2
@@ -2943,24 +2947,24 @@ entry:
 define i32 @test_sdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
 ; CHECK-LABEL: test_sdot_v64i8_double:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q18, q19, [sp, #96]
-; CHECK-NEXT:    movi v22.2d, #0000000000000000
-; CHECK-NEXT:    movi v23.2d, #0000000000000000
-; CHECK-NEXT:    movi v24.2d, #0000000000000000
-; CHECK-NEXT:    movi v25.2d, #0000000000000000
-; CHECK-NEXT:    sdot v22.4s, v3.16b, v7.16b
-; CHECK-NEXT:    ldp q20, q21, [sp, #32]
-; CHECK-NEXT:    sdot v23.4s, v2.16b, v6.16b
-; CHECK-NEXT:    sdot v22.4s, v1.16b, v5.16b
-; CHECK-NEXT:    sdot v25.4s, v20.16b, v18.16b
-; CHECK-NEXT:    sdot v23.4s, v0.16b, v4.16b
-; CHECK-NEXT:    ldp q16, q17, [sp, #64]
-; CHECK-NEXT:    sdot v24.4s, v21.16b, v19.16b
-; CHECK-NEXT:    add v0.4s, v23.4s, v22.4s
-; CHECK-NEXT:    ldp q26, q3, [sp]
-; CHECK-NEXT:    sdot v25.4s, v26.16b, v16.16b
-; CHECK-NEXT:    sdot v24.4s, v3.16b, v17.16b
-; CHECK-NEXT:    add v1.4s, v25.4s, v24.4s
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    ldp q20, q21, [sp, #96]
+; CHECK-NEXT:    ldp q22, q23, [sp, #32]
+; CHECK-NEXT:    sdot v16.4s, v3.16b, v7.16b
+; CHECK-NEXT:    sdot v18.4s, v2.16b, v6.16b
+; CHECK-NEXT:    sdot v19.4s, v23.16b, v21.16b
+; CHECK-NEXT:    sdot v17.4s, v22.16b, v20.16b
+; CHECK-NEXT:    ldp q2, q3, [sp, #64]
+; CHECK-NEXT:    ldp q6, q7, [sp]
+; CHECK-NEXT:    sdot v16.4s, v1.16b, v5.16b
+; CHECK-NEXT:    sdot v18.4s, v0.16b, v4.16b
+; CHECK-NEXT:    sdot v19.4s, v7.16b, v3.16b
+; CHECK-NEXT:    sdot v17.4s, v6.16b, v2.16b
+; CHECK-NEXT:    add v0.4s, v18.4s, v16.4s
+; CHECK-NEXT:    add v1.4s, v17.4s, v19.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -2981,23 +2985,23 @@ entry:
 define i32 @test_sdot_v64i8_double_nomla(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
 ; CHECK-LABEL: test_sdot_v64i8_double_nomla:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q4, q5, [sp, #32]
-; CHECK-NEXT:    movi v6.16b, #1
+; CHECK-NEXT:    movi v4.16b, #1
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
 ; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    ldp q17, q18, [sp, #32]
 ; CHECK-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-NEXT:    sdot v7.4s, v3.16b, v6.16b
-; CHECK-NEXT:    sdot v16.4s, v2.16b, v6.16b
-; CHECK-NEXT:    ldp q3, q2, [sp]
-; CHECK-NEXT:    sdot v17.4s, v5.16b, v6.16b
-; CHECK-NEXT:    sdot v18.4s, v4.16b, v6.16b
-; CHECK-NEXT:    sdot v7.4s, v1.16b, v6.16b
-; CHECK-NEXT:    sdot v16.4s, v0.16b, v6.16b
-; CHECK-NEXT:    sdot v17.4s, v2.16b, v6.16b
-; CHECK-NEXT:    sdot v18.4s, v3.16b, v6.16b
-; CHECK-NEXT:    add v0.4s, v16.4s, v7.4s
-; CHECK-NEXT:    add v1.4s, v18.4s, v17.4s
+; CHECK-NEXT:    sdot v5.4s, v3.16b, v4.16b
+; CHECK-NEXT:    sdot v6.4s, v17.16b, v4.16b
+; CHECK-NEXT:    sdot v7.4s, v2.16b, v4.16b
+; CHECK-NEXT:    ldp q2, q3, [sp]
+; CHECK-NEXT:    sdot v16.4s, v18.16b, v4.16b
+; CHECK-NEXT:    sdot v5.4s, v1.16b, v4.16b
+; CHECK-NEXT:    sdot v6.4s, v2.16b, v4.16b
+; CHECK-NEXT:    sdot v7.4s, v0.16b, v4.16b
+; CHECK-NEXT:    sdot v16.4s, v3.16b, v4.16b
+; CHECK-NEXT:    add v0.4s, v7.4s, v5.4s
+; CHECK-NEXT:    add v1.4s, v6.4s, v16.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0

diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index 76fab7ff733bc8..6d2305059ce887 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -56,12 +56,13 @@ entry:
 define <32 x i16> @extadds_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) {
 ; CHECK-LABEL: extadds_v32i8_i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v4.8h, v0.16b, v2.16b
-; CHECK-NEXT:    saddl2 v5.8h, v1.16b, v3.16b
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v2.8b
+; CHECK-NEXT:    saddl2 v4.8h, v1.16b, v3.16b
+; CHECK-NEXT:    saddl v5.8h, v0.8b, v2.8b
+; CHECK-NEXT:    saddl2 v6.8h, v0.16b, v2.16b
 ; CHECK-NEXT:    saddl v2.8h, v1.8b, v3.8b
-; CHECK-NEXT:    mov v1.16b, v4.16b
-; CHECK-NEXT:    mov v3.16b, v5.16b
+; CHECK-NEXT:    mov v0.16b, v5.16b
+; CHECK-NEXT:    mov v1.16b, v6.16b
+; CHECK-NEXT:    mov v3.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <32 x i8> %s0 to <32 x i16>
@@ -73,12 +74,13 @@ entry:
 define <32 x i16> @extaddu_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) {
 ; CHECK-LABEL: extaddu_v32i8_i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v4.8h, v0.16b, v2.16b
-; CHECK-NEXT:    uaddl2 v5.8h, v1.16b, v3.16b
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v2.8b
+; CHECK-NEXT:    uaddl2 v4.8h, v1.16b, v3.16b
+; CHECK-NEXT:    uaddl v5.8h, v0.8b, v2.8b
+; CHECK-NEXT:    uaddl2 v6.8h, v0.16b, v2.16b
 ; CHECK-NEXT:    uaddl v2.8h, v1.8b, v3.8b
-; CHECK-NEXT:    mov v1.16b, v4.16b
-; CHECK-NEXT:    mov v3.16b, v5.16b
+; CHECK-NEXT:    mov v0.16b, v5.16b
+; CHECK-NEXT:    mov v1.16b, v6.16b
+; CHECK-NEXT:    mov v3.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <32 x i8> %s0 to <32 x i16>
@@ -118,12 +120,12 @@ entry:
 define <16 x i32> @extadds_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: extadds_v16i8_i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    sshll2 v3.4s, v2.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    saddl v2.8h, v0.8b, v1.8b
+; CHECK-NEXT:    saddl2 v4.8h, v0.16b, v1.16b
+; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i32>
@@ -135,12 +137,12 @@ entry:
 define <16 x i32> @extaddu_v16i8_i32(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: extaddu_v16i8_i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ushll2 v3.4s, v2.8h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    uaddl v2.8h, v0.8b, v1.8b
+; CHECK-NEXT:    uaddl2 v4.8h, v0.16b, v1.16b
+; CHECK-NEXT:    ushll v0.4s, v2.4h, #0
+; CHECK-NEXT:    ushll2 v3.4s, v4.8h, #0
+; CHECK-NEXT:    ushll2 v1.4s, v2.8h, #0
+; CHECK-NEXT:    ushll v2.4s, v4.4h, #0
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
@@ -153,11 +155,11 @@ define <8 x i64> @extadds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
 ; CHECK-LABEL: extadds_v8i8_i64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    sshll v1.4s, v0.4h, #0
 ; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    sshll v0.2d, v1.2s, #0
 ; CHECK-NEXT:    sshll2 v3.2d, v2.4s, #0
-; CHECK-NEXT:    sshll2 v1.2d, v0.4s, #0
-; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-NEXT:    sshll2 v1.2d, v1.4s, #0
 ; CHECK-NEXT:    sshll v2.2d, v2.2s, #0
 ; CHECK-NEXT:    ret
 entry:
@@ -171,11 +173,11 @@ define <8 x i64> @extaddu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
 ; CHECK-LABEL: extaddu_v8i8_i64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v1.4s, v0.4h, #0
 ; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    ushll v0.2d, v1.2s, #0
 ; CHECK-NEXT:    ushll2 v3.2d, v2.4s, #0
-; CHECK-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-NEXT:    ret
 entry:
@@ -240,12 +242,13 @@ entry:
 define <16 x i32> @extadds_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) {
 ; CHECK-LABEL: extadds_v16i16_i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v4.4s, v0.8h, v2.8h
-; CHECK-NEXT:    saddl2 v5.4s, v1.8h, v3.8h
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v2.4h
+; CHECK-NEXT:    saddl2 v4.4s, v1.8h, v3.8h
+; CHECK-NEXT:    saddl v5.4s, v0.4h, v2.4h
+; CHECK-NEXT:    saddl2 v6.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    saddl v2.4s, v1.4h, v3.4h
-; CHECK-NEXT:    mov v1.16b, v4.16b
-; CHECK-NEXT:    mov v3.16b, v5.16b
+; CHECK-NEXT:    mov v0.16b, v5.16b
+; CHECK-NEXT:    mov v1.16b, v6.16b
+; CHECK-NEXT:    mov v3.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <16 x i16> %s0 to <16 x i32>
@@ -257,12 +260,13 @@ entry:
 define <16 x i32> @extaddu_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) {
 ; CHECK-LABEL: extaddu_v16i16_i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v4.4s, v0.8h, v2.8h
-; CHECK-NEXT:    uaddl2 v5.4s, v1.8h, v3.8h
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v2.4h
+; CHECK-NEXT:    uaddl2 v4.4s, v1.8h, v3.8h
+; CHECK-NEXT:    uaddl v5.4s, v0.4h, v2.4h
+; CHECK-NEXT:    uaddl2 v6.4s, v0.8h, v2.8h
 ; CHECK-NEXT:    uaddl v2.4s, v1.4h, v3.4h
-; CHECK-NEXT:    mov v1.16b, v4.16b
-; CHECK-NEXT:    mov v3.16b, v5.16b
+; CHECK-NEXT:    mov v0.16b, v5.16b
+; CHECK-NEXT:    mov v1.16b, v6.16b
+; CHECK-NEXT:    mov v3.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <16 x i16> %s0 to <16 x i32>
@@ -302,12 +306,12 @@ entry:
 define <8 x i64> @extadds_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: extadds_v8i16_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    sshll2 v3.2d, v2.4s, #0
-; CHECK-NEXT:    sshll2 v1.2d, v0.4s, #0
-; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-NEXT:    saddl v2.4s, v0.4h, v1.4h
+; CHECK-NEXT:    saddl2 v4.4s, v0.8h, v1.8h
+; CHECK-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-NEXT:    sshll2 v3.2d, v4.4s, #0
+; CHECK-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-NEXT:    sshll v2.2d, v4.2s, #0
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <8 x i16> %s0 to <8 x i64>
@@ -319,12 +323,12 @@ entry:
 define <8 x i64> @extaddu_v8i16_i64(<8 x i16> %s0, <8 x i16> %s1) {
 ; CHECK-LABEL: extaddu_v8i16_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    ushll2 v3.2d, v2.4s, #0
-; CHECK-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-NEXT:    uaddl v2.4s, v0.4h, v1.4h
+; CHECK-NEXT:    uaddl2 v4.4s, v0.8h, v1.8h
+; CHECK-NEXT:    ushll v0.2d, v2.2s, #0
+; CHECK-NEXT:    ushll2 v3.2d, v4.4s, #0
+; CHECK-NEXT:    ushll2 v1.2d, v2.4s, #0
+; CHECK-NEXT:    ushll v2.2d, v4.2s, #0
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <8 x i16> %s0 to <8 x i64>
@@ -388,12 +392,13 @@ entry:
 define <8 x i64> @extadds_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: extadds_v8i32_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    saddl2 v5.2d, v1.4s, v3.4s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v2.2s
+; CHECK-NEXT:    saddl2 v4.2d, v1.4s, v3.4s
+; CHECK-NEXT:    saddl v5.2d, v0.2s, v2.2s
+; CHECK-NEXT:    saddl2 v6.2d, v0.4s, v2.4s
 ; CHECK-NEXT:    saddl v2.2d, v1.2s, v3.2s
-; CHECK-NEXT:    mov v1.16b, v4.16b
-; CHECK-NEXT:    mov v3.16b, v5.16b
+; CHECK-NEXT:    mov v0.16b, v5.16b
+; CHECK-NEXT:    mov v1.16b, v6.16b
+; CHECK-NEXT:    mov v3.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <8 x i32> %s0 to <8 x i64>
@@ -405,12 +410,13 @@ entry:
 define <8 x i64> @extaddu_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) {
 ; CHECK-LABEL: extaddu_v8i32_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
-; CHECK-NEXT:    uaddl2 v5.2d, v1.4s, v3.4s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v2.2s
+; CHECK-NEXT:    uaddl2 v4.2d, v1.4s, v3.4s
+; CHECK-NEXT:    uaddl v5.2d, v0.2s, v2.2s
+; CHECK-NEXT:    uaddl2 v6.2d, v0.4s, v2.4s
 ; CHECK-NEXT:    uaddl v2.2d, v1.2s, v3.2s
-; CHECK-NEXT:    mov v1.16b, v4.16b
-; CHECK-NEXT:    mov v3.16b, v5.16b
+; CHECK-NEXT:    mov v0.16b, v5.16b
+; CHECK-NEXT:    mov v1.16b, v6.16b
+; CHECK-NEXT:    mov v3.16b, v4.16b
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <8 x i32> %s0 to <8 x i64>
@@ -422,14 +428,14 @@ entry:
 define <16 x i32> @add_zs(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: add_zs:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    saddw2 v2.8h, v2.8h, v1.16b
-; CHECK-NEXT:    saddw v0.8h, v0.8h, v1.8b
-; CHECK-NEXT:    sshll2 v3.4s, v2.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-NEXT:    saddw v2.8h, v2.8h, v1.8b
+; CHECK-NEXT:    saddw2 v4.8h, v0.8h, v1.16b
+; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
@@ -441,79 +447,79 @@ entry:
 define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
 ; CHECK-LABEL: v20:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr b0, [sp, #96]
-; CHECK-NEXT:    add x9, sp, #104
+; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    ldr b2, [sp, #160]
 ; CHECK-NEXT:    add x10, sp, #168
 ; CHECK-NEXT:    ldr b3, [sp]
-; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    ld1 { v0.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #112
+; CHECK-NEXT:    add x11, sp, #8
+; CHECK-NEXT:    ldr b1, [sp, #96]
 ; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #8
-; CHECK-NEXT:    add x11, sp, #128
-; CHECK-NEXT:    add x12, sp, #184
-; CHECK-NEXT:    mov v1.b[1], w1
-; CHECK-NEXT:    add x13, sp, #192
-; CHECK-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #120
-; CHECK-NEXT:    ld1 { v3.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #16
-; CHECK-NEXT:    ldr b4, [sp, #224]
-; CHECK-NEXT:    mov v1.b[2], w2
+; CHECK-NEXT:    add x9, sp, #104
+; CHECK-NEXT:    add x10, sp, #176
+; CHECK-NEXT:    mov v0.b[1], w1
+; CHECK-NEXT:    ld1 { v3.b }[1], [x11]
+; CHECK-NEXT:    ld1 { v1.b }[1], [x9]
+; CHECK-NEXT:    add x12, sp, #16
+; CHECK-NEXT:    add x9, sp, #112
+; CHECK-NEXT:    add x13, sp, #184
+; CHECK-NEXT:    ld1 { v2.b }[2], [x10]
+; CHECK-NEXT:    add x11, sp, #120
+; CHECK-NEXT:    add x14, sp, #32
+; CHECK-NEXT:    ld1 { v3.b }[2], [x12]
+; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
 ; CHECK-NEXT:    ldr b5, [sp, #64]
-; CHECK-NEXT:    ld1 { v0.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #176
-; CHECK-NEXT:    ld1 { v3.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #24
-; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #136
-; CHECK-NEXT:    ld1 { v0.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #144
-; CHECK-NEXT:    ld1 { v3.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #32
-; CHECK-NEXT:    mov v1.b[3], w3
-; CHECK-NEXT:    ld1 { v2.b }[3], [x12]
-; CHECK-NEXT:    add x12, sp, #200
-; CHECK-NEXT:    ld1 { v0.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #152
-; CHECK-NEXT:    ld1 { v3.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #72
-; CHECK-NEXT:    mov v1.b[4], w4
-; CHECK-NEXT:    ld1 { v2.b }[4], [x13]
-; CHECK-NEXT:    add x13, sp, #232
-; CHECK-NEXT:    ld1 { v0.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #40
-; CHECK-NEXT:    ld1 { v5.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #80
-; CHECK-NEXT:    ld1 { v4.b }[1], [x13]
-; CHECK-NEXT:    ld1 { v2.b }[5], [x12]
-; CHECK-NEXT:    add x12, sp, #240
-; CHECK-NEXT:    ld1 { v0.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #208
-; CHECK-NEXT:    ld1 { v3.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #216
-; CHECK-NEXT:    mov v1.b[5], w5
-; CHECK-NEXT:    ld1 { v4.b }[2], [x12]
-; CHECK-NEXT:    ld1 { v2.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #48
-; CHECK-NEXT:    ld1 { v5.b }[2], [x10]
+; CHECK-NEXT:    mov v0.b[2], w2
+; CHECK-NEXT:    ldr b4, [sp, #224]
+; CHECK-NEXT:    add x10, sp, #128
+; CHECK-NEXT:    ld1 { v2.b }[3], [x13]
+; CHECK-NEXT:    add x13, sp, #24
+; CHECK-NEXT:    add x12, sp, #136
+; CHECK-NEXT:    ld1 { v3.b }[3], [x13]
+; CHECK-NEXT:    ld1 { v1.b }[3], [x11]
+; CHECK-NEXT:    add x11, sp, #192
+; CHECK-NEXT:    add x13, sp, #200
+; CHECK-NEXT:    add x15, sp, #80
+; CHECK-NEXT:    add x9, sp, #144
+; CHECK-NEXT:    mov v0.b[3], w3
+; CHECK-NEXT:    ld1 { v2.b }[4], [x11]
+; CHECK-NEXT:    add x11, sp, #232
+; CHECK-NEXT:    ld1 { v3.b }[4], [x14]
+; CHECK-NEXT:    add x14, sp, #72
+; CHECK-NEXT:    ld1 { v4.b }[1], [x11]
+; CHECK-NEXT:    ld1 { v5.b }[1], [x14]
+; CHECK-NEXT:    add x14, sp, #40
+; CHECK-NEXT:    ld1 { v1.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[5], [x13]
+; CHECK-NEXT:    add x11, sp, #208
+; CHECK-NEXT:    add x13, sp, #48
+; CHECK-NEXT:    mov v0.b[4], w4
+; CHECK-NEXT:    ld1 { v3.b }[5], [x14]
+; CHECK-NEXT:    add x14, sp, #240
+; CHECK-NEXT:    ld1 { v4.b }[2], [x14]
+; CHECK-NEXT:    ld1 { v5.b }[2], [x15]
+; CHECK-NEXT:    ld1 { v1.b }[5], [x12]
+; CHECK-NEXT:    ld1 { v2.b }[6], [x11]
+; CHECK-NEXT:    add x10, sp, #216
+; CHECK-NEXT:    add x11, sp, #56
+; CHECK-NEXT:    ld1 { v3.b }[6], [x13]
 ; CHECK-NEXT:    add x12, sp, #248
-; CHECK-NEXT:    add x10, sp, #56
-; CHECK-NEXT:    ld1 { v3.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #88
-; CHECK-NEXT:    mov v1.b[6], w6
-; CHECK-NEXT:    ld1 { v2.b }[7], [x11]
+; CHECK-NEXT:    add x13, sp, #88
+; CHECK-NEXT:    mov v0.b[5], w5
 ; CHECK-NEXT:    ld1 { v4.b }[3], [x12]
-; CHECK-NEXT:    ld1 { v5.b }[3], [x9]
-; CHECK-NEXT:    ld1 { v3.b }[7], [x10]
-; CHECK-NEXT:    mov v1.b[7], w7
+; CHECK-NEXT:    ld1 { v5.b }[3], [x13]
+; CHECK-NEXT:    ld1 { v1.b }[6], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[7], [x10]
+; CHECK-NEXT:    add x9, sp, #152
+; CHECK-NEXT:    ld1 { v3.b }[7], [x11]
 ; CHECK-NEXT:    uaddl v4.8h, v5.8b, v4.8b
+; CHECK-NEXT:    mov v0.b[6], w6
+; CHECK-NEXT:    ld1 { v1.b }[7], [x9]
 ; CHECK-NEXT:    uaddl v2.8h, v3.8b, v2.8b
-; CHECK-NEXT:    uaddl v0.8h, v1.8b, v0.8b
-; CHECK-NEXT:    ushll v1.4s, v4.4h, #0
+; CHECK-NEXT:    mov v0.b[7], w7
 ; CHECK-NEXT:    ushll2 v3.4s, v2.8h, #0
 ; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ushll v1.4s, v4.4h, #0
 ; CHECK-NEXT:    stp q3, q1, [x8, #48]
 ; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
 ; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
@@ -530,83 +536,95 @@ entry:
 define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) {
 ; CHECK-LABEL: i12:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr w12, [sp, #32]
-; CHECK-NEXT:    fmov s5, w0
-; CHECK-NEXT:    ldr w15, [sp]
-; CHECK-NEXT:    fmov s4, w4
-; CHECK-NEXT:    ldr w14, [sp, #40]
-; CHECK-NEXT:    fmov s0, w12
-; CHECK-NEXT:    ldr w16, [sp, #48]
-; CHECK-NEXT:    fmov s1, w15
-; CHECK-NEXT:    ldr w15, [sp, #8]
-; CHECK-NEXT:    ldr w18, [sp, #16]
-; CHECK-NEXT:    mov v0.h[1], w14
-; CHECK-NEXT:    ldr w17, [sp, #56]
-; CHECK-NEXT:    mov v1.h[1], w15
-; CHECK-NEXT:    ldr w0, [sp, #24]
-; CHECK-NEXT:    mov v5.h[1], w1
-; CHECK-NEXT:    ldr w13, [sp, #64]
-; CHECK-NEXT:    ldr w1, [sp, #128]
-; CHECK-NEXT:    mov v0.h[2], w16
-; CHECK-NEXT:    ldr w16, [sp, #96]
-; CHECK-NEXT:    mov v1.h[2], w18
-; CHECK-NEXT:    ldr w10, [sp, #72]
-; CHECK-NEXT:    mov v5.h[2], w2
-; CHECK-NEXT:    ldr w2, [sp, #160]
-; CHECK-NEXT:    mov v4.h[1], w5
-; CHECK-NEXT:    ldr w5, [sp, #168]
-; CHECK-NEXT:    mov v0.h[3], w17
-; CHECK-NEXT:    ldr w14, [sp, #104]
-; CHECK-NEXT:    mov v1.h[3], w0
-; CHECK-NEXT:    ldr w18, [sp, #136]
-; CHECK-NEXT:    fmov s6, w1
-; CHECK-NEXT:    ldr w0, [sp, #176]
-; CHECK-NEXT:    fmov s7, w16
-; CHECK-NEXT:    fmov s16, w13
-; CHECK-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-NEXT:    ldr w9, [sp, #80]
-; CHECK-NEXT:    movi v0.4s, #15, msl #8
+; CHECK-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset w23, -48
 ; CHECK-NEXT:    ldr w12, [sp, #112]
+; CHECK-NEXT:    ldr w14, [sp, #144]
+; CHECK-NEXT:    fmov s2, w4
+; CHECK-NEXT:    ldr w16, [sp, #176]
+; CHECK-NEXT:    ldr w19, [sp, #208]
+; CHECK-NEXT:    fmov s3, w0
+; CHECK-NEXT:    ldr w20, [sp, #80]
+; CHECK-NEXT:    ldr w21, [sp, #48]
+; CHECK-NEXT:    fmov s5, w12
+; CHECK-NEXT:    fmov s4, w19
+; CHECK-NEXT:    fmov s6, w16
+; CHECK-NEXT:    fmov s7, w14
+; CHECK-NEXT:    fmov s0, w20
+; CHECK-NEXT:    fmov s1, w21
+; CHECK-NEXT:    ldr w10, [sp, #120]
+; CHECK-NEXT:    ldr w11, [sp, #152]
+; CHECK-NEXT:    ldr w13, [sp, #184]
+; CHECK-NEXT:    ldr w15, [sp, #216]
+; CHECK-NEXT:    ldr w22, [sp, #88]
+; CHECK-NEXT:    ldr w23, [sp, #56]
+; CHECK-NEXT:    mov v2.h[1], w5
+; CHECK-NEXT:    mov v3.h[1], w1
+; CHECK-NEXT:    mov v5.h[1], w10
+; CHECK-NEXT:    mov v4.h[1], w15
+; CHECK-NEXT:    mov v0.h[1], w22
+; CHECK-NEXT:    mov v1.h[1], w23
+; CHECK-NEXT:    mov v6.h[1], w13
+; CHECK-NEXT:    mov v7.h[1], w11
+; CHECK-NEXT:    ldr w8, [sp, #128]
+; CHECK-NEXT:    ldr w9, [sp, #160]
+; CHECK-NEXT:    ldr w17, [sp, #64]
+; CHECK-NEXT:    ldr w18, [sp, #96]
+; CHECK-NEXT:    ldr w10, [sp, #192]
+; CHECK-NEXT:    ldr w11, [sp, #224]
+; CHECK-NEXT:    mov v2.h[2], w6
+; CHECK-NEXT:    mov v3.h[2], w2
+; CHECK-NEXT:    mov v0.h[2], w18
+; CHECK-NEXT:    mov v1.h[2], w17
+; CHECK-NEXT:    mov v5.h[2], w8
+; CHECK-NEXT:    mov v4.h[2], w11
+; CHECK-NEXT:    mov v6.h[2], w10
+; CHECK-NEXT:    mov v7.h[2], w9
+; CHECK-NEXT:    ldr w12, [sp, #72]
+; CHECK-NEXT:    ldr w13, [sp, #104]
+; CHECK-NEXT:    ldr w8, [sp, #136]
+; CHECK-NEXT:    ldr w9, [sp, #168]
+; CHECK-NEXT:    ldr w10, [sp, #200]
+; CHECK-NEXT:    ldr w11, [sp, #232]
+; CHECK-NEXT:    mov v0.h[3], w13
+; CHECK-NEXT:    mov v1.h[3], w12
+; CHECK-NEXT:    mov v2.h[3], w7
+; CHECK-NEXT:    mov v3.h[3], w3
+; CHECK-NEXT:    mov v5.h[3], w8
+; CHECK-NEXT:    mov v4.h[3], w11
+; CHECK-NEXT:    mov v6.h[3], w10
+; CHECK-NEXT:    mov v7.h[3], w9
+; CHECK-NEXT:    movi v16.4s, #15, msl #8
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    ldr w17, [sp, #144]
-; CHECK-NEXT:    mov v6.h[1], w18
-; CHECK-NEXT:    ldr w4, [sp, #184]
-; CHECK-NEXT:    mov v7.h[1], w14
-; CHECK-NEXT:    ldr w8, [sp, #88]
-; CHECK-NEXT:    and v3.16b, v2.16b, v0.16b
-; CHECK-NEXT:    ldr w11, [sp, #120]
-; CHECK-NEXT:    and v2.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ldr w15, [sp, #152]
-; CHECK-NEXT:    fmov s1, w2
-; CHECK-NEXT:    mov v16.h[1], w10
-; CHECK-NEXT:    mov v4.h[2], w6
-; CHECK-NEXT:    mov v1.h[1], w5
-; CHECK-NEXT:    mov v6.h[2], w17
-; CHECK-NEXT:    mov v7.h[2], w12
-; CHECK-NEXT:    mov v16.h[2], w9
-; CHECK-NEXT:    mov v1.h[2], w0
-; CHECK-NEXT:    mov v4.h[3], w7
-; CHECK-NEXT:    mov v5.h[3], w3
-; CHECK-NEXT:    mov v6.h[3], w15
-; CHECK-NEXT:    mov v1.h[3], w4
-; CHECK-NEXT:    mov v7.h[3], w11
-; CHECK-NEXT:    mov v16.h[3], w8
-; CHECK-NEXT:    ushll v4.4s, v4.4h, #0
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-NEXT:    ushll v5.4s, v5.4h, #0
+; CHECK-NEXT:    ushll v4.4s, v4.4h, #0
 ; CHECK-NEXT:    ushll v6.4s, v6.4h, #0
-; CHECK-NEXT:    and v17.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ushll v1.4s, v7.4h, #0
-; CHECK-NEXT:    ushll v7.4s, v16.4h, #0
-; CHECK-NEXT:    and v4.16b, v4.16b, v0.16b
-; CHECK-NEXT:    and v5.16b, v5.16b, v0.16b
-; CHECK-NEXT:    and v6.16b, v6.16b, v0.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v0.16b
-; CHECK-NEXT:    and v0.16b, v7.16b, v0.16b
-; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
-; CHECK-NEXT:    add v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    add v3.4s, v3.4s, v17.4s
+; CHECK-NEXT:    ushll v7.4s, v7.4h, #0
+; CHECK-NEXT:    and v17.16b, v0.16b, v16.16b
+; CHECK-NEXT:    and v18.16b, v1.16b, v16.16b
+; CHECK-NEXT:    and v1.16b, v2.16b, v16.16b
+; CHECK-NEXT:    and v0.16b, v3.16b, v16.16b
+; CHECK-NEXT:    and v2.16b, v5.16b, v16.16b
+; CHECK-NEXT:    and v3.16b, v4.16b, v16.16b
+; CHECK-NEXT:    and v4.16b, v6.16b, v16.16b
+; CHECK-NEXT:    and v5.16b, v7.16b, v16.16b
+; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v3.4s, v17.4s, v3.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    add v2.4s, v18.4s, v4.4s
+; CHECK-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <16 x i12> %s0 to <16 x i32>
@@ -618,12 +636,12 @@ entry:
 define <16 x i32> @sub_zz(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: sub_zz:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    usubl2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    sshll2 v3.4s, v2.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    usubl v2.8h, v0.8b, v1.8b
+; CHECK-NEXT:    usubl2 v4.8h, v0.16b, v1.16b
+; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>
@@ -635,12 +653,12 @@ entry:
 define <16 x i32> @sub_ss(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: sub_ss:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ssubl2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    ssubl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    sshll2 v3.4s, v2.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    ssubl v2.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ssubl2 v4.8h, v0.16b, v1.16b
+; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
 ; CHECK-NEXT:    ret
 entry:
   %s0s = sext <16 x i8> %s0 to <16 x i32>
@@ -652,14 +670,14 @@ entry:
 define <16 x i32> @sub_zs(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: sub_zs:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ssubw2 v2.8h, v2.8h, v1.16b
-; CHECK-NEXT:    ssubw v0.8h, v0.8h, v1.8b
-; CHECK-NEXT:    sshll2 v3.4s, v2.8h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-NEXT:    ssubw v2.8h, v2.8h, v1.8b
+; CHECK-NEXT:    ssubw2 v4.8h, v0.8h, v1.16b
+; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v3.4s, v4.8h, #0
+; CHECK-NEXT:    sshll2 v1.4s, v2.8h, #0
+; CHECK-NEXT:    sshll v2.4s, v4.4h, #0
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i32>

diff --git a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
index b2ecd6ead55057..3f590226c47150 100644
--- a/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extracttruncate.ll
@@ -206,8 +206,8 @@ define <16 x i8> @extract_4_mixed(<4 x i16> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 ; CHECK-LABEL: extract_4_mixed:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
 ; CHECK-NEXT:    xtn2 v0.8h, v1.4s
 ; CHECK-NEXT:    mov v2.d[1], v3.d[0]
 ; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
@@ -267,11 +267,11 @@ entry:
 define <16 x i8> @extract_4_v4i32_badindex(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; CHECK-LABEL: extract_4_v4i32_badindex:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x8, .LCPI5_0
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI5_0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll
index dc1b4c6627dd73..2d0842cadc280c 100644
--- a/llvm/test/CodeGen/AArch64/neon-mov.ll
+++ b/llvm/test/CodeGen/AArch64/neon-mov.ll
@@ -333,8 +333,8 @@ declare <2 x i32> @test_movi1d(<2 x i32>, <2 x i32>)
 define <2 x i32> @movi1d() {
 ; CHECK-LABEL: movi1d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI40_0
 ; CHECK-NEXT:    movi d1, #0x00ffffffff0000
+; CHECK-NEXT:    adrp x8, .LCPI40_0
 ; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI40_0]
 ; CHECK-NEXT:    b test_movi1d
   %1 = tail call <2 x i32> @test_movi1d(<2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>)

diff --git a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
index 041d0b40a8b8ec..de90024a4a2571 100644
--- a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll
@@ -48,8 +48,8 @@ define <8 x i16> @v8i16_2(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -83,8 +83,8 @@ define <16 x i8> @v16i8_2(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, .LCPI7_0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI7_0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/neon-rshrn.ll b/llvm/test/CodeGen/AArch64/neon-rshrn.ll
index 563509c75d12da..8d47f4afb355f7 100644
--- a/llvm/test/CodeGen/AArch64/neon-rshrn.ll
+++ b/llvm/test/CodeGen/AArch64/neon-rshrn.ll
@@ -112,8 +112,8 @@ define <16 x i8> @rshrn_v16i16_9(<16 x i16> %a) {
 ; CHECK-NEXT:    movi v2.8h, #1, lsl #8
 ; CHECK-NEXT:    add v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    add v1.8h, v1.8h, v2.8h
-; CHECK-NEXT:    ushr v0.8h, v0.8h, #9
 ; CHECK-NEXT:    ushr v1.8h, v1.8h, #9
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #9
 ; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -338,8 +338,8 @@ define <8 x i16> @rshrn_v8i32_17(<8 x i32> %a) {
 ; CHECK-NEXT:    movi v2.4s, #1, lsl #16
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #17
 ; CHECK-NEXT:    ushr v1.4s, v1.4s, #17
+; CHECK-NEXT:    ushr v0.4s, v0.4s, #17
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
 entry:
@@ -773,8 +773,8 @@ define <4 x i32> @rshrn_v4i64_33(<4 x i64> %a) {
 ; CHECK-NEXT:    dup v2.2d, x8
 ; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
-; CHECK-NEXT:    ushr v0.2d, v0.2d, #33
 ; CHECK-NEXT:    ushr v1.2d, v1.2d, #33
+; CHECK-NEXT:    ushr v0.2d, v0.2d, #33
 ; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll
index 45272143e8592f..881bbf315e8e99 100644
--- a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll
+++ b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll
@@ -375,8 +375,8 @@ entry:
 define <vscale x 2 x i64> @shrn64x2(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-LABEL: shrn64x2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg x8, x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    neg x8, x0
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -391,8 +391,8 @@ entry:
 define <vscale x 4 x i32> @shrn32x4(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-LABEL: shrn32x4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -407,8 +407,8 @@ entry:
 define <vscale x 8 x i16> @shrn16x8(<vscale x 8 x i16> %a, i16 %b) {
 ; CHECK-LABEL: shrn16x8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
@@ -423,8 +423,8 @@ entry:
 define <vscale x 16 x i8> @shrn8x16(<vscale x 16 x i8> %a, i8 %b) {
 ; CHECK-LABEL: shrn8x16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    mov z1.b, w8
 ; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret
@@ -439,8 +439,8 @@ entry:
 define <vscale x 2 x i64> @lshrn64x2(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-LABEL: lshrn64x2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg x8, x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    neg x8, x0
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -455,8 +455,8 @@ entry:
 define <vscale x 4 x i32> @lshrn32x4(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-LABEL: lshrn32x4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -471,8 +471,8 @@ entry:
 define <vscale x 8 x i16> @lshrn16x8(<vscale x 8 x i16> %a, i16 %b) {
 ; CHECK-LABEL: lshrn16x8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
@@ -487,8 +487,8 @@ entry:
 define <vscale x 16 x i8> @lshrn8x16(<vscale x 16 x i8> %a, i8 %b) {
 ; CHECK-LABEL: lshrn8x16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    mov z1.b, w8
 ; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret
@@ -503,8 +503,8 @@ entry:
 define <vscale x 2 x i64> @shln64x2(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-LABEL: shln64x2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg x8, x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    neg x8, x0
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -519,8 +519,8 @@ entry:
 define <vscale x 4 x i32> @shln32x4(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-LABEL: shln32x4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -535,8 +535,8 @@ entry:
 define <vscale x 8 x i16> @shln16x8(<vscale x 8 x i16> %a, i16 %b) {
 ; CHECK-LABEL: shln16x8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
@@ -551,8 +551,8 @@ entry:
 define <vscale x 16 x i8> @shln8x16(<vscale x 16 x i8> %a, i8 %b) {
 ; CHECK-LABEL: shln8x16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    neg w8, w0
 ; CHECK-NEXT:    mov z1.b, w8
 ; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
index 083cc00c8c1b74..b677d077b98c14 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
@@ -141,10 +141,10 @@ define void @v32i32_v32i8(<32 x i32> %a, ptr %result) {
 ; CHECK-LABEL: v32i32_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 v6.8h, v6.8h, v7.8h
+; CHECK-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
 ; CHECK-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
-; CHECK-NEXT:    uzp1 v3.8h, v4.8h, v5.8h
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    uzp1 v1.16b, v3.16b, v6.16b
+; CHECK-NEXT:    uzp1 v1.16b, v4.16b, v6.16b
 ; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/neon-wide-splat.ll b/llvm/test/CodeGen/AArch64/neon-wide-splat.ll
index bd477888c172d8..8f05fd6cb76bd9 100644
--- a/llvm/test/CodeGen/AArch64/neon-wide-splat.ll
+++ b/llvm/test/CodeGen/AArch64/neon-wide-splat.ll
@@ -131,8 +131,8 @@ entry:
 define <8 x i8> @shuffle_not4(<8 x i8> %v) {
 ; CHECK-LABEL: shuffle_not4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x8, .LCPI11_0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    adrp x8, .LCPI11_0
 ; CHECK-NEXT:    mov v0.d[1], v0.d[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI11_0]
 ; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b

diff --git a/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
index 845d88ebf3bd40..afcced5dcb9ab5 100644
--- a/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll
@@ -138,8 +138,8 @@ define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, .LCPI12_0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI12_0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -153,8 +153,8 @@ define <8 x i16> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, .LCPI13_0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:
@@ -168,8 +168,8 @@ define <8 x i16> @shuffle_widen_fail3(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/no-sve-no-neon.ll b/llvm/test/CodeGen/AArch64/no-sve-no-neon.ll
index 0ad216685e18ed..83a6bef5fa5e6c 100644
--- a/llvm/test/CodeGen/AArch64/no-sve-no-neon.ll
+++ b/llvm/test/CodeGen/AArch64/no-sve-no-neon.ll
@@ -7,32 +7,32 @@ define <16 x float> @foo(<16 x i64> %a) {
 ; CHECK-LABEL: foo:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x10, x9, [sp, #48]
-; CHECK-NEXT:    ldp x12, x11, [sp, #32]
-; CHECK-NEXT:    ucvtf s1, x10
 ; CHECK-NEXT:    ucvtf s0, x9
-; CHECK-NEXT:    ldp x13, x9, [sp, #16]
-; CHECK-NEXT:    ucvtf s2, x11
-; CHECK-NEXT:    ucvtf s3, x12
-; CHECK-NEXT:    ldp x11, x10, [sp]
+; CHECK-NEXT:    ldp x11, x9, [sp, #32]
+; CHECK-NEXT:    ucvtf s1, x10
+; CHECK-NEXT:    ucvtf s2, x9
+; CHECK-NEXT:    ldp x10, x9, [sp, #16]
+; CHECK-NEXT:    ucvtf s3, x11
 ; CHECK-NEXT:    str s0, [x8, #60]
-; CHECK-NEXT:    ucvtf s0, x13
 ; CHECK-NEXT:    str s1, [x8, #56]
 ; CHECK-NEXT:    ucvtf s4, x9
+; CHECK-NEXT:    ucvtf s0, x10
+; CHECK-NEXT:    ldp x11, x9, [sp]
 ; CHECK-NEXT:    str s2, [x8, #52]
-; CHECK-NEXT:    ucvtf s2, x11
 ; CHECK-NEXT:    str s3, [x8, #48]
-; CHECK-NEXT:    ucvtf s1, x10
 ; CHECK-NEXT:    ucvtf s3, x7
-; CHECK-NEXT:    str s0, [x8, #40]
-; CHECK-NEXT:    ucvtf s0, x5
+; CHECK-NEXT:    ucvtf s1, x9
+; CHECK-NEXT:    ucvtf s2, x11
 ; CHECK-NEXT:    str s4, [x8, #44]
 ; CHECK-NEXT:    ucvtf s4, x6
-; CHECK-NEXT:    str s2, [x8, #32]
-; CHECK-NEXT:    ucvtf s2, x3
-; CHECK-NEXT:    str s1, [x8, #36]
-; CHECK-NEXT:    ucvtf s1, x4
+; CHECK-NEXT:    str s0, [x8, #40]
+; CHECK-NEXT:    ucvtf s0, x5
 ; CHECK-NEXT:    str s3, [x8, #28]
 ; CHECK-NEXT:    ucvtf s3, x2
+; CHECK-NEXT:    str s1, [x8, #36]
+; CHECK-NEXT:    ucvtf s1, x4
+; CHECK-NEXT:    str s2, [x8, #32]
+; CHECK-NEXT:    ucvtf s2, x3
 ; CHECK-NEXT:    str s4, [x8, #24]
 ; CHECK-NEXT:    ucvtf s4, x1
 ; CHECK-NEXT:    str s0, [x8, #20]

diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index f8f7de48ca800b..ad46d32e4bf632 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -321,31 +321,31 @@ define <17 x float> @test_ldnp_v17f32(ptr %A) {
 ; CHECK-LABEL: test_ldnp_v17f32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ldnp q0, q1, [x0, #32]
-; CHECK-NEXT:    ldnp q2, q3, [x0]
-; CHECK-NEXT:    ldr s4, [x0, #64]
+; CHECK-NEXT:    ldr s2, [x0, #64]
+; CHECK-NEXT:    ldnp q3, q4, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x8, #32]
-; CHECK-NEXT:    stp q2, q3, [x8]
-; CHECK-NEXT:    str s4, [x8, #64]
+; CHECK-NEXT:    stp q3, q4, [x8]
+; CHECK-NEXT:    str s2, [x8, #64]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v17f32:
 ; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    add x9, x0, #32
-; CHECK-BE-NEXT:    ld1 { v1.4s }, [x0]
-; CHECK-BE-NEXT:    add x10, x0, #16
-; CHECK-BE-NEXT:    ldr s2, [x0, #64]
-; CHECK-BE-NEXT:    ld1 { v0.4s }, [x9]
 ; CHECK-BE-NEXT:    add x9, x0, #48
-; CHECK-BE-NEXT:    ld1 { v4.4s }, [x10]
-; CHECK-BE-NEXT:    add x10, x8, #32
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    add x10, x0, #32
+; CHECK-BE-NEXT:    ld1 { v2.4s }, [x9]
+; CHECK-BE-NEXT:    add x9, x0, #16
+; CHECK-BE-NEXT:    ldr s1, [x0, #64]
 ; CHECK-BE-NEXT:    ld1 { v3.4s }, [x9]
+; CHECK-BE-NEXT:    ld1 { v4.4s }, [x10]
 ; CHECK-BE-NEXT:    add x9, x8, #48
-; CHECK-BE-NEXT:    str s2, [x8, #64]
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT:    str s1, [x8, #64]
+; CHECK-BE-NEXT:    add x10, x8, #32
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x8]
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    st1 { v3.4s }, [x9]
-; CHECK-BE-NEXT:    st1 { v0.4s }, [x10]
-; CHECK-BE-NEXT:    st1 { v4.4s }, [x8]
+; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v4.4s }, [x10]
+; CHECK-BE-NEXT:    st1 { v3.4s }, [x8]
 ; CHECK-BE-NEXT:    ret
   %lv = load <17 x float>, ptr %A, align 8, !nontemporal !0
   ret <17 x float> %lv
@@ -355,90 +355,90 @@ define <33 x double> @test_ldnp_v33f64(ptr %A) {
 ; CHECK-LABEL: test_ldnp_v33f64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ldnp q0, q1, [x0]
+; CHECK-NEXT:    ldr d20, [x0, #256]
 ; CHECK-NEXT:    ldnp q2, q3, [x0, #32]
 ; CHECK-NEXT:    ldnp q4, q5, [x0, #64]
 ; CHECK-NEXT:    ldnp q6, q7, [x0, #96]
 ; CHECK-NEXT:    ldnp q16, q17, [x0, #128]
 ; CHECK-NEXT:    ldnp q18, q19, [x0, #224]
-; CHECK-NEXT:    ldnp q20, q21, [x0, #192]
-; CHECK-NEXT:    ldnp q22, q23, [x0, #160]
-; CHECK-NEXT:    ldr d24, [x0, #256]
+; CHECK-NEXT:    ldnp q21, q22, [x0, #160]
+; CHECK-NEXT:    ldnp q23, q24, [x0, #192]
 ; CHECK-NEXT:    stp q0, q1, [x8]
 ; CHECK-NEXT:    stp q2, q3, [x8, #32]
 ; CHECK-NEXT:    stp q4, q5, [x8, #64]
 ; CHECK-NEXT:    stp q6, q7, [x8, #96]
 ; CHECK-NEXT:    stp q16, q17, [x8, #128]
-; CHECK-NEXT:    stp q22, q23, [x8, #160]
-; CHECK-NEXT:    stp q20, q21, [x8, #192]
+; CHECK-NEXT:    stp q21, q22, [x8, #160]
+; CHECK-NEXT:    stp q23, q24, [x8, #192]
 ; CHECK-NEXT:    stp q18, q19, [x8, #224]
-; CHECK-NEXT:    str d24, [x8, #256]
+; CHECK-NEXT:    str d20, [x8, #256]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v33f64:
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    add x9, x0, #16
 ; CHECK-BE-NEXT:    add x10, x0, #32
-; CHECK-BE-NEXT:    ld1 { v21.2d }, [x0]
-; CHECK-BE-NEXT:    add x11, x8, #208
+; CHECK-BE-NEXT:    add x11, x0, #48
 ; CHECK-BE-NEXT:    ld1 { v0.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x0, #48
+; CHECK-BE-NEXT:    add x9, x0, #64
 ; CHECK-BE-NEXT:    ld1 { v1.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x0, #64
-; CHECK-BE-NEXT:    ld1 { v2.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x0, #80
-; CHECK-BE-NEXT:    ld1 { v3.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x0, #96
-; CHECK-BE-NEXT:    ld1 { v4.2d }, [x9]
+; CHECK-BE-NEXT:    add x10, x0, #80
+; CHECK-BE-NEXT:    ld1 { v3.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x0, #112
-; CHECK-BE-NEXT:    ld1 { v5.2d }, [x10]
+; CHECK-BE-NEXT:    ld1 { v4.2d }, [x10]
 ; CHECK-BE-NEXT:    add x10, x0, #128
 ; CHECK-BE-NEXT:    ld1 { v6.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x0, #144
+; CHECK-BE-NEXT:    add x9, x0, #160
 ; CHECK-BE-NEXT:    ld1 { v7.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x0, #160
-; CHECK-BE-NEXT:    ld1 { v16.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x0, #176
-; CHECK-BE-NEXT:    ld1 { v17.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x0, #192
-; CHECK-BE-NEXT:    ld1 { v18.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x0, #224
-; CHECK-BE-NEXT:    ld1 { v19.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x0, #208
-; CHECK-BE-NEXT:    ld1 { v20.2d }, [x9]
+; CHECK-BE-NEXT:    add x10, x0, #176
+; CHECK-BE-NEXT:    ld1 { v17.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x0, #240
-; CHECK-BE-NEXT:    ldr d22, [x0, #256]
+; CHECK-BE-NEXT:    ld1 { v2.2d }, [x11]
+; CHECK-BE-NEXT:    add x11, x0, #96
+; CHECK-BE-NEXT:    ld1 { v18.2d }, [x10]
+; CHECK-BE-NEXT:    ld1 { v20.2d }, [x0]
+; CHECK-BE-NEXT:    ld1 { v22.2d }, [x9]
+; CHECK-BE-NEXT:    add x10, x0, #224
+; CHECK-BE-NEXT:    ld1 { v5.2d }, [x11]
+; CHECK-BE-NEXT:    add x11, x0, #144
+; CHECK-BE-NEXT:    ldr d21, [x0, #256]
+; CHECK-BE-NEXT:    add x9, x0, #208
+; CHECK-BE-NEXT:    ld1 { v24.2d }, [x10]
+; CHECK-BE-NEXT:    ld1 { v16.2d }, [x11]
+; CHECK-BE-NEXT:    add x11, x0, #192
 ; CHECK-BE-NEXT:    ld1 { v23.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x8, #240
-; CHECK-BE-NEXT:    ld1 { v24.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x8, #224
-; CHECK-BE-NEXT:    str d22, [x8, #256]
-; CHECK-BE-NEXT:    st1 { v21.2d }, [x8]
+; CHECK-BE-NEXT:    ld1 { v19.2d }, [x11]
+; CHECK-BE-NEXT:    str d21, [x8, #256]
+; CHECK-BE-NEXT:    st1 { v20.2d }, [x8]
+; CHECK-BE-NEXT:    st1 { v22.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x8, #224
+; CHECK-BE-NEXT:    st1 { v24.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x8, #208
 ; CHECK-BE-NEXT:    st1 { v23.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x8, #192
-; CHECK-BE-NEXT:    st1 { v20.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x8, #176
-; CHECK-BE-NEXT:    st1 { v24.2d }, [x11]
-; CHECK-BE-NEXT:    add x11, x8, #160
 ; CHECK-BE-NEXT:    st1 { v19.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x8, #176
+; CHECK-BE-NEXT:    st1 { v18.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x8, #160
+; CHECK-BE-NEXT:    st1 { v17.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x8, #144
-; CHECK-BE-NEXT:    st1 { v18.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x8, #128
-; CHECK-BE-NEXT:    st1 { v17.2d }, [x11]
-; CHECK-BE-NEXT:    add x11, x8, #112
 ; CHECK-BE-NEXT:    st1 { v16.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x8, #128
+; CHECK-BE-NEXT:    st1 { v7.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x8, #112
+; CHECK-BE-NEXT:    st1 { v6.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x8, #96
-; CHECK-BE-NEXT:    st1 { v7.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x8, #80
-; CHECK-BE-NEXT:    st1 { v6.2d }, [x11]
-; CHECK-BE-NEXT:    add x11, x8, #64
 ; CHECK-BE-NEXT:    st1 { v5.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x8, #80
+; CHECK-BE-NEXT:    st1 { v4.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x8, #64
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x8, #48
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x8, #32
-; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x11]
 ; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
-; CHECK-BE-NEXT:    st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT:    add x9, x8, #32
+; CHECK-BE-NEXT:    add x8, x8, #16
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
 ; CHECK-BE-NEXT:    st1 { v0.2d }, [x8]
 ; CHECK-BE-NEXT:    ret
   %lv = load <33 x double>, ptr %A, align 8, !nontemporal !0
@@ -459,12 +459,12 @@ define <33 x i8> @test_ldnp_v33i8(ptr %A) {
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    add x9, x0, #16
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x0]
-; CHECK-BE-NEXT:    add x10, x8, #16
+; CHECK-BE-NEXT:    ldrb w10, [x0, #32]
 ; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
-; CHECK-BE-NEXT:    ldrb w9, [x0, #32]
-; CHECK-BE-NEXT:    strb w9, [x8, #32]
+; CHECK-BE-NEXT:    strb w10, [x8, #32]
 ; CHECK-BE-NEXT:    st1 { v0.16b }, [x8]
-; CHECK-BE-NEXT:    st1 { v1.16b }, [x10]
+; CHECK-BE-NEXT:    add x8, x8, #16
+; CHECK-BE-NEXT:    st1 { v1.16b }, [x8]
 ; CHECK-BE-NEXT:    ret
   %lv = load<33 x i8>, ptr %A, align 8, !nontemporal !0
   ret <33 x i8> %lv
@@ -476,13 +476,13 @@ define <4 x i65> @test_ldnp_v4i65(ptr %A) {
 ; CHECK-NEXT:    ldp x8, x9, [x0, #8]
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr x10, [x0, #24]
-; CHECK-NEXT:    and x1, x8, #0x1
 ; CHECK-NEXT:    ldrb w11, [x0, #32]
+; CHECK-NEXT:    and x1, x8, #0x1
 ; CHECK-NEXT:    extr x2, x9, x8, #1
 ; CHECK-NEXT:    extr x4, x10, x9, #2
+; CHECK-NEXT:    mov.d v0[1], x1
 ; CHECK-NEXT:    extr x6, x11, x10, #3
 ; CHECK-NEXT:    ubfx x3, x9, #1, #1
-; CHECK-NEXT:    mov.d v0[1], x1
 ; CHECK-NEXT:    ubfx x5, x10, #2, #1
 ; CHECK-NEXT:    ubfx x7, x11, #3, #1
 ; CHECK-NEXT:    fmov x0, d0
@@ -490,35 +490,35 @@ define <4 x i65> @test_ldnp_v4i65(ptr %A) {
 ;
 ; CHECK-BE-LABEL: test_ldnp_v4i65:
 ; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    ldp x10, x9, [x0, #16]
-; CHECK-BE-NEXT:    ldp x12, x11, [x0]
+; CHECK-BE-NEXT:    ldp x10, x9, [x0]
 ; CHECK-BE-NEXT:    ldrb w8, [x0, #32]
+; CHECK-BE-NEXT:    ldp x12, x11, [x0, #16]
 ; CHECK-BE-NEXT:    lsr x13, x10, #56
-; CHECK-BE-NEXT:    lsr x14, x12, #56
-; CHECK-BE-NEXT:    extr x15, x11, x10, #56
-; CHECK-BE-NEXT:    orr x7, x8, x9, lsl #8
+; CHECK-BE-NEXT:    orr x7, x8, x11, lsl #8
 ; CHECK-BE-NEXT:    extr x8, x10, x9, #56
-; CHECK-BE-NEXT:    extr x9, x12, x11, #56
-; CHECK-BE-NEXT:    lsr x12, x12, #59
-; CHECK-BE-NEXT:    ubfx x10, x10, #57, #1
-; CHECK-BE-NEXT:    extr x5, x13, x8, #1
-; CHECK-BE-NEXT:    extr x1, x14, x9, #3
-; CHECK-BE-NEXT:    ubfx x9, x11, #58, #1
-; CHECK-BE-NEXT:    fmov d0, x12
-; CHECK-BE-NEXT:    and x12, x8, #0x1
-; CHECK-BE-NEXT:    lsr x11, x11, #56
-; CHECK-BE-NEXT:    fmov d2, x10
+; CHECK-BE-NEXT:    extr x11, x12, x11, #56
+; CHECK-BE-NEXT:    lsr x14, x12, #56
+; CHECK-BE-NEXT:    extr x15, x9, x12, #56
+; CHECK-BE-NEXT:    lsr x10, x10, #59
+; CHECK-BE-NEXT:    extr x1, x13, x8, #3
+; CHECK-BE-NEXT:    lsr x8, x9, #56
+; CHECK-BE-NEXT:    ubfx x12, x12, #57, #1
+; CHECK-BE-NEXT:    ubfx x9, x9, #58, #1
+; CHECK-BE-NEXT:    extr x5, x14, x11, #1
+; CHECK-BE-NEXT:    and x11, x11, #0x1
+; CHECK-BE-NEXT:    fmov d0, x10
+; CHECK-BE-NEXT:    fmov d2, x12
+; CHECK-BE-NEXT:    fmov d3, x11
 ; CHECK-BE-NEXT:    fmov d1, x9
-; CHECK-BE-NEXT:    extr x3, x11, x15, #2
-; CHECK-BE-NEXT:    fmov d3, x12
+; CHECK-BE-NEXT:    extr x3, x8, x15, #2
 ; CHECK-BE-NEXT:    mov v0.d[1], x1
 ; CHECK-BE-NEXT:    mov v2.d[1], x5
-; CHECK-BE-NEXT:    mov v1.d[1], x3
 ; CHECK-BE-NEXT:    mov v3.d[1], x7
+; CHECK-BE-NEXT:    mov v1.d[1], x3
 ; CHECK-BE-NEXT:    fmov x0, d0
 ; CHECK-BE-NEXT:    fmov x4, d2
-; CHECK-BE-NEXT:    fmov x2, d1
 ; CHECK-BE-NEXT:    fmov x6, d3
+; CHECK-BE-NEXT:    fmov x2, d1
 ; CHECK-BE-NEXT:    ret
   %lv = load <4 x i65>, ptr %A, align 8, !nontemporal !0
   ret <4 x i65> %lv
@@ -528,23 +528,24 @@ define <4 x i63> @test_ldnp_v4i63(ptr %A) {
 ; CHECK-LABEL: test_ldnp_v4i63:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ldp x8, x9, [x0]
-; CHECK-NEXT:    ldp x10, x12, [x0, #16]
-; CHECK-NEXT:    extr x11, x9, x8, #63
+; CHECK-NEXT:    ldp x10, x11, [x0, #16]
+; CHECK-NEXT:    extr x12, x9, x8, #63
 ; CHECK-NEXT:    and x0, x8, #0x7fffffffffffffff
 ; CHECK-NEXT:    extr x9, x10, x9, #62
-; CHECK-NEXT:    extr x3, x12, x10, #61
-; CHECK-NEXT:    and x1, x11, #0x7fffffffffffffff
+; CHECK-NEXT:    extr x3, x11, x10, #61
+; CHECK-NEXT:    and x1, x12, #0x7fffffffffffffff
 ; CHECK-NEXT:    and x2, x9, #0x7fffffffffffffff
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_ldnp_v4i63:
 ; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-BE-NEXT:    ldp x11, x10, [x0]
-; CHECK-BE-NEXT:    and x3, x9, #0x7fffffffffffffff
-; CHECK-BE-NEXT:    extr x12, x10, x8, #62
-; CHECK-BE-NEXT:    extr x8, x8, x9, #63
-; CHECK-BE-NEXT:    extr x0, x11, x10, #61
+; CHECK-BE-NEXT:    ldp x9, x8, [x0, #8]
+; CHECK-BE-NEXT:    ldr x11, [x0, #24]
+; CHECK-BE-NEXT:    ldr x10, [x0]
+; CHECK-BE-NEXT:    and x3, x11, #0x7fffffffffffffff
+; CHECK-BE-NEXT:    extr x12, x9, x8, #62
+; CHECK-BE-NEXT:    extr x8, x8, x11, #63
+; CHECK-BE-NEXT:    extr x0, x10, x9, #61
 ; CHECK-BE-NEXT:    and x1, x12, #0x7fffffffffffffff
 ; CHECK-BE-NEXT:    and x2, x8, #0x7fffffffffffffff
 ; CHECK-BE-NEXT:    ret
@@ -558,10 +559,10 @@ define <5 x double> @test_ldnp_v5f64(ptr %A) {
 ; CHECK-NEXT:    ldnp q0, q2, [x0]
 ; CHECK-NEXT:    ldr d4, [x0, #32]
 ; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q1
 ; CHECK-NEXT:    ext.16b v3, v2, v2, #8
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ; kill: def $d2 killed $d2 killed $q2
+; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q1
 ; CHECK-NEXT:    ; kill: def $d3 killed $d3 killed $q3
 ; CHECK-NEXT:    ret
 ;
@@ -574,8 +575,8 @@ define <5 x double> @test_ldnp_v5f64(ptr %A) {
 ; CHECK-BE-NEXT:    // kill: def $d4 killed $d4 killed $q4
 ; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-BE-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-BE-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-BE-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-BE-NEXT:    // kill: def $d3 killed $d3 killed $q3
 ; CHECK-BE-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll
index 92c56fcc9fc61b..4fcb5c0342e525 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal.ll
@@ -439,43 +439,43 @@ entry:
 define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) {
 ; CHECK-LABEL: test_stnp_v17f32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    ldr s16, [sp, #16]
-; CHECK-NEXT:    add x8, sp, #20
-; CHECK-NEXT:    ldr s17, [sp]
-; CHECK-NEXT:    add x9, sp, #4
 ; CHECK-NEXT:    ; kill: def $s4 killed $s4 def $q4
 ; CHECK-NEXT:    ; kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr s16, [sp, #16]
 ; CHECK-NEXT:    ; kill: def $s5 killed $s5 def $q5
 ; CHECK-NEXT:    ; kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT:    add x8, sp, #20
 ; CHECK-NEXT:    ; kill: def $s6 killed $s6 def $q6
 ; CHECK-NEXT:    ; kill: def $s2 killed $s2 def $q2
 ; CHECK-NEXT:    ; kill: def $s7 killed $s7 def $q7
 ; CHECK-NEXT:    ; kill: def $s3 killed $s3 def $q3
-; CHECK-NEXT:    ld1.s { v16 }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #24
-; CHECK-NEXT:    ld1.s { v17 }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #8
 ; CHECK-NEXT:    mov.s v4[1], v5[0]
 ; CHECK-NEXT:    mov.s v0[1], v1[0]
-; CHECK-NEXT:    ld1.s { v16 }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #28
-; CHECK-NEXT:    ld1.s { v17 }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #12
+; CHECK-NEXT:    ldr s5, [sp]
+; CHECK-NEXT:    ld1.s { v16 }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #4
+; CHECK-NEXT:    ld1.s { v5 }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #24
 ; CHECK-NEXT:    mov.s v4[2], v6[0]
+; CHECK-NEXT:    ld1.s { v16 }[2], [x8]
 ; CHECK-NEXT:    mov.s v0[2], v2[0]
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    ld1.s { v5 }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #28
 ; CHECK-NEXT:    ld1.s { v16 }[3], [x8]
-; CHECK-NEXT:    ld1.s { v17 }[3], [x9]
+; CHECK-NEXT:    add x8, sp, #12
 ; CHECK-NEXT:    mov.s v4[3], v7[0]
 ; CHECK-NEXT:    mov.s v0[3], v3[0]
+; CHECK-NEXT:    ld1.s { v5 }[3], [x8]
 ; CHECK-NEXT:    mov d1, v16[1]
-; CHECK-NEXT:    mov d2, v17[1]
+; CHECK-NEXT:    mov d2, v5[1]
 ; CHECK-NEXT:    mov d3, v4[1]
-; CHECK-NEXT:    mov d5, v0[1]
+; CHECK-NEXT:    mov d6, v0[1]
 ; CHECK-NEXT:    stnp d16, d1, [x0, #48]
 ; CHECK-NEXT:    ldr s1, [sp, #32]
-; CHECK-NEXT:    stnp d17, d2, [x0, #32]
+; CHECK-NEXT:    stnp d5, d2, [x0, #32]
 ; CHECK-NEXT:    stnp d4, d3, [x0, #16]
-; CHECK-NEXT:    stnp d0, d5, [x0]
+; CHECK-NEXT:    stnp d0, d6, [x0]
 ; CHECK-NEXT:    str s1, [x0, #64]
 ; CHECK-NEXT:    ret
 
@@ -486,8 +486,8 @@ entry:
 define void @test_stnp_v16i32_invalid_offset(<16 x i32> %v, ptr %ptr) {
 ; CHECK-LABEL: test_stnp_v16i32_invalid_offset:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    mov w8, #32032
-; CHECK-NEXT:    mov w9, #32000
+; CHECK-NEXT:    mov w8, #32032 ; =0x7d20
+; CHECK-NEXT:    mov w9, #32000 ; =0x7d00
 ; CHECK-NEXT:    add x8, x0, x8
 ; CHECK-NEXT:    add x9, x0, x9
 ; CHECK-NEXT:    stnp q2, q3, [x8]

diff --git a/llvm/test/CodeGen/AArch64/nzcv-save.ll b/llvm/test/CodeGen/AArch64/nzcv-save.ll
index 2fee2666316ba0..9bc4ccf2787ad9 100644
--- a/llvm/test/CodeGen/AArch64/nzcv-save.ll
+++ b/llvm/test/CodeGen/AArch64/nzcv-save.ll
@@ -6,20 +6,20 @@
 define void @f(ptr nocapture %a, ptr nocapture %b, ptr nocapture %cc, ptr nocapture %dd) nounwind uwtable noinline ssp {
 ; CHECK-LABEL: f:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp x9, x8, [x2]
-; CHECK-NEXT:    ldp x11, x10, [x3]
+; CHECK-NEXT:    ldp x8, x10, [x2]
+; CHECK-NEXT:    ldp x9, x11, [x3]
 ; CHECK-NEXT:    ldp x13, x12, [x2, #16]
-; CHECK-NEXT:    ldp x14, x15, [x3, #16]
-; CHECK-NEXT:    adds x9, x9, x11
-; CHECK-NEXT:    adcs x8, x8, x10
-; CHECK-NEXT:    adcs x10, x13, x14
-; CHECK-NEXT:    adc x11, x12, x15
+; CHECK-NEXT:    adds x8, x8, x9
+; CHECK-NEXT:    ldp x14, x9, [x3, #16]
+; CHECK-NEXT:    adcs x10, x10, x11
+; CHECK-NEXT:    stp x8, x10, [x0]
+; CHECK-NEXT:    adcs x11, x13, x14
+; CHECK-NEXT:    adc x13, x12, x9
 ; CHECK-NEXT:    orr x12, x12, #0x100
-; CHECK-NEXT:    adc x12, x12, x15
-; CHECK-NEXT:    stp x9, x8, [x0]
-; CHECK-NEXT:    stp x10, x11, [x0, #16]
-; CHECK-NEXT:    stp x10, x12, [x1, #16]
-; CHECK-NEXT:    stp x9, x8, [x1]
+; CHECK-NEXT:    adc x9, x12, x9
+; CHECK-NEXT:    stp x11, x13, [x0, #16]
+; CHECK-NEXT:    stp x11, x9, [x1, #16]
+; CHECK-NEXT:    stp x8, x10, [x1]
 ; CHECK-NEXT:    ret
 entry:
   %c = load i256, ptr %cc

diff --git a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
index e676cacf1b51da..d86af564f2622b 100644
--- a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
+++ b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
@@ -5,18 +5,18 @@ define dso_local void @jsimd_idct_ifast_neon_intrinsic(ptr nocapture readonly %d
 ; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr q0, [x1, #32]
-; CHECK-NEXT:    mov w8, w3
 ; CHECK-NEXT:    ldr q1, [x1, #96]
+; CHECK-NEXT:    mov w9, w3
 ; CHECK-NEXT:    ldr q2, [x0, #32]
 ; CHECK-NEXT:    ldr q3, [x0, #96]
-; CHECK-NEXT:    ldr x9, [x2, #48]
+; CHECK-NEXT:    ldr x8, [x2, #48]
 ; CHECK-NEXT:    mul v0.8h, v2.8h, v0.8h
 ; CHECK-NEXT:    mul v1.8h, v3.8h, v1.8h
 ; CHECK-NEXT:    add v2.8h, v0.8h, v1.8h
-; CHECK-NEXT:    str q2, [x9, x8]
-; CHECK-NEXT:    ldr x9, [x2, #56]
 ; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    str q0, [x9, x8]
+; CHECK-NEXT:    str q2, [x8, x9]
+; CHECK-NEXT:    ldr x8, [x2, #56]
+; CHECK-NEXT:    str q0, [x8, x9]
 ; CHECK-NEXT:    ret
 entry:
   %add.ptr5 = getelementptr inbounds i16, ptr %coef_block, i64 16

diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index b47dd2ede5f9ea..17ad2983abe905 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -8,11 +8,10 @@
 define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
 ; CHECK-LABEL: test_func_i32_two_uses:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:ptr_wrapper
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:ptr_wrapper]
-; CHECK-NEXT:    ldr x9, [x9]
+; CHECK-NEXT:    adrp x8, :got:ptr_wrapper
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:ptr_wrapper]
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    b .LBB0_3
 ; CHECK-NEXT:  .LBB0_1: // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    str xzr, [x9, #8]
@@ -21,9 +20,9 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
 ; CHECK-NEXT:    cbz w1, .LBB0_6
 ; CHECK-NEXT:  .LBB0_3: // %do.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ands w10, w1, w8
-; CHECK-NEXT:    and w11, w2, w8
-; CHECK-NEXT:    cinc w0, w0, ne
+; CHECK-NEXT:    ands w10, w1, w0
+; CHECK-NEXT:    and w11, w2, w0
+; CHECK-NEXT:    cinc w8, w8, ne
 ; CHECK-NEXT:    cmp w10, w11
 ; CHECK-NEXT:    b.eq .LBB0_1
 ; CHECK-NEXT:  // %bb.4: // %do.body
@@ -34,6 +33,7 @@ define i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) {
 ; CHECK-NEXT:    cbz w10, .LBB0_2
 ; CHECK-NEXT:    b .LBB0_1
 ; CHECK-NEXT:  .LBB0_6: // %do.end
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
 entry:
   %0 = load ptr, ptr @ptr_wrapper, align 8
@@ -72,25 +72,25 @@ do.end:                                           ; preds = %4
 define i32 @test_func_i64_one_use(i64 %in, i64 %bit, i64 %mask) {
 ; CHECK-LABEL: test_func_i64_one_use:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    adrp x9, :got:ptr_wrapper
-; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ldr x9, [x9, :got_lo12:ptr_wrapper]
-; CHECK-NEXT:    ldr x9, [x9]
+; CHECK-NEXT:    adrp x8, :got:ptr_wrapper
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:ptr_wrapper]
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    b .LBB1_2
 ; CHECK-NEXT:  .LBB1_1: // in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    lsl x1, x1, #1
 ; CHECK-NEXT:    cbz x1, .LBB1_4
 ; CHECK-NEXT:  .LBB1_2: // %do.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ands x10, x1, x8
+; CHECK-NEXT:    ands x10, x1, x0
 ; CHECK-NEXT:    orr x10, x2, x10
-; CHECK-NEXT:    cinc w0, w0, ne
+; CHECK-NEXT:    cinc w8, w8, ne
 ; CHECK-NEXT:    cbz x10, .LBB1_1
 ; CHECK-NEXT:  // %bb.3: // in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    str xzr, [x9, #8]
 ; CHECK-NEXT:    b .LBB1_1
 ; CHECK-NEXT:  .LBB1_4: // %do.end
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
 entry:
   %0 = load ptr, ptr @ptr_wrapper, align 8

diff --git a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
index 55f1ad9b2e1ebc..ff9dcbeda18c11 100644
--- a/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
+++ b/llvm/test/CodeGen/AArch64/pmull-ldr-merge.ll
@@ -6,10 +6,10 @@
 define void @test1(ptr %0, i64 %1, i64 %2) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x2, lsl #4
-; CHECK-NEXT:    add x9, x0, x1, lsl #4
-; CHECK-NEXT:    ldr d0, [x8, #8]
-; CHECK-NEXT:    ldr d1, [x9, #8]
+; CHECK-NEXT:    add x8, x0, x1, lsl #4
+; CHECK-NEXT:    add x9, x0, x2, lsl #4
+; CHECK-NEXT:    ldr d0, [x9, #8]
+; CHECK-NEXT:    ldr d1, [x8, #8]
 ; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
@@ -49,9 +49,9 @@ define void @test3(ptr %0, i64 %1, i64 %2, i64 %3) {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add x8, x0, x1, lsl #4
-; CHECK-NEXT:    fmov d0, x3
-; CHECK-NEXT:    ldr d1, [x8, #8]
-; CHECK-NEXT:    pmull v0.1q, v1.1d, v0.1d
+; CHECK-NEXT:    fmov d1, x3
+; CHECK-NEXT:    ldr d0, [x8, #8]
+; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   %5 = getelementptr inbounds <2 x i64>, ptr %0, i64 %1

diff --git a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
index 87e3f4a5705311..ea9588e9e3db7d 100644
--- a/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
+++ b/llvm/test/CodeGen/AArch64/pr-cf624b2.ll
@@ -13,42 +13,42 @@ define linkonce_odr void @_ZN1y2beEPiRK1vPmPS1_(<8 x i8> %0, ptr %agg.tmp.i) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    orr x9, x8, #0xf
-; CHECK-NEXT:    orr x11, x8, #0xc
 ; CHECK-NEXT:    orr x10, x8, #0xe
-; CHECK-NEXT:    orr x12, x8, #0x8
 ; CHECK-NEXT:    st1 { v0.b }[0], [x8]
 ; CHECK-NEXT:    st1 { v0.b }[15], [x9]
+; CHECK-NEXT:    orr x9, x8, #0xc
+; CHECK-NEXT:    st1 { v0.b }[12], [x9]
+; CHECK-NEXT:    orr x9, x8, #0x8
+; CHECK-NEXT:    st1 { v0.b }[8], [x9]
 ; CHECK-NEXT:    orr x9, x8, #0x7
-; CHECK-NEXT:    st1 { v0.b }[12], [x11]
-; CHECK-NEXT:    orr x11, x8, #0x4
-; CHECK-NEXT:    st1 { v0.b }[14], [x10]
-; CHECK-NEXT:    orr x10, x8, #0x6
 ; CHECK-NEXT:    st1 { v0.b }[7], [x9]
+; CHECK-NEXT:    orr x9, x8, #0x6
+; CHECK-NEXT:    st1 { v0.b }[6], [x9]
+; CHECK-NEXT:    orr x9, x8, #0x4
+; CHECK-NEXT:    st1 { v0.b }[4], [x9]
 ; CHECK-NEXT:    orr x9, x8, #0x3
-; CHECK-NEXT:    st1 { v0.b }[8], [x12]
-; CHECK-NEXT:    mov w12, #11
-; CHECK-NEXT:    st1 { v0.b }[4], [x11]
-; CHECK-NEXT:    mov w11, #13
 ; CHECK-NEXT:    st1 { v0.b }[3], [x9]
 ; CHECK-NEXT:    orr x9, x8, #0x2
-; CHECK-NEXT:    st1 { v0.b }[6], [x10]
-; CHECK-NEXT:    orr x10, x8, #0x1
-; CHECK-NEXT:    orr x11, x8, x11
+; CHECK-NEXT:    st1 { v0.b }[14], [x10]
+; CHECK-NEXT:    mov w10, #13 // =0xd
 ; CHECK-NEXT:    st1 { v0.b }[2], [x9]
-; CHECK-NEXT:    orr x9, x8, x12
-; CHECK-NEXT:    st1 { v0.b }[1], [x10]
-; CHECK-NEXT:    mov w10, #9
-; CHECK-NEXT:    st1 { v0.b }[13], [x11]
-; CHECK-NEXT:    mov w11, #5
+; CHECK-NEXT:    orr x9, x8, #0x1
+; CHECK-NEXT:    st1 { v0.b }[1], [x9]
+; CHECK-NEXT:    orr x9, x8, x10
+; CHECK-NEXT:    mov w10, #11 // =0xb
+; CHECK-NEXT:    st1 { v0.b }[13], [x9]
+; CHECK-NEXT:    orr x9, x8, x10
+; CHECK-NEXT:    mov w10, #10 // =0xa
 ; CHECK-NEXT:    st1 { v0.b }[11], [x9]
-; CHECK-NEXT:    mov w9, #10
-; CHECK-NEXT:    orr x9, x8, x9
-; CHECK-NEXT:    orr x10, x8, x10
-; CHECK-NEXT:    orr x8, x8, x11
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    orr x9, x8, x10
+; CHECK-NEXT:    mov w10, #9 // =0x9
 ; CHECK-NEXT:    st1 { v0.b }[10], [x9]
-; CHECK-NEXT:    st1 { v0.b }[9], [x10]
+; CHECK-NEXT:    orr x9, x8, x10
+; CHECK-NEXT:    st1 { v0.b }[9], [x9]
+; CHECK-NEXT:    mov w9, #5 // =0x5
+; CHECK-NEXT:    orr x8, x8, x9
 ; CHECK-NEXT:    st1 { v0.b }[5], [x8]
 ; CHECK-NEXT:    ldr q0, [sp]
 ; CHECK-NEXT:    stp q0, q1, [x0]

diff --git a/llvm/test/CodeGen/AArch64/pr58350.ll b/llvm/test/CodeGen/AArch64/pr58350.ll
index efbdfb16850276..f7efab1ff66b33 100644
--- a/llvm/test/CodeGen/AArch64/pr58350.ll
+++ b/llvm/test/CodeGen/AArch64/pr58350.ll
@@ -12,9 +12,9 @@ define void @f(<1 x float> %a, i64 %b) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    bfi x9, x0, #2, #1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    bfi x9, x0, #2, #1
 ; CHECK-NEXT:    str d1, [sp]
 ; CHECK-NEXT:    ldr s1, [x9]
 ; CHECK-NEXT:    mov v1.s[1], v0.s[0]

diff --git a/llvm/test/CodeGen/AArch64/pr58516.ll b/llvm/test/CodeGen/AArch64/pr58516.ll
index 3bc904c9a63a77..b4840f01ce116b 100644
--- a/llvm/test/CodeGen/AArch64/pr58516.ll
+++ b/llvm/test/CodeGen/AArch64/pr58516.ll
@@ -24,13 +24,13 @@ define void @osfx(ptr %this) comdat personality ptr @__CxxFrameHandler3 {
 ; CHECK-NEXT:    sub x9, sp, #32
 ; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK-NEXT:    mov x19, sp
-; CHECK-NEXT:    mov x1, #-2
-; CHECK-NEXT:    add x8, x19, #0
+; CHECK-NEXT:    mov x1, #-2 // =0xfffffffffffffffe
 ; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x8, x19, #0
+; CHECK-NEXT:    stur x1, [x29, #24]
 ; CHECK-NEXT:    lsr x21, x8, #3
 ; CHECK-NEXT:    adrp x8, osfx
 ; CHECK-NEXT:    add x8, x8, :lo12:osfx
-; CHECK-NEXT:    stur x1, [x29, #24]
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    str wzr, [x21]
 ; CHECK-NEXT:    ldr x0, [x0]

diff --git a/llvm/test/CodeGen/AArch64/pr61549.ll b/llvm/test/CodeGen/AArch64/pr61549.ll
index c947706827f875..e66ee7d219cc5e 100644
--- a/llvm/test/CodeGen/AArch64/pr61549.ll
+++ b/llvm/test/CodeGen/AArch64/pr61549.ll
@@ -5,10 +5,10 @@
 define i35 @f(i35 %0) {
 ; CHECK-LABEL: f:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    sbfx x9, x0, #0, #35
-; CHECK-NEXT:    sdiv x10, x8, x9
-; CHECK-NEXT:    msub x8, x10, x9, x8
+; CHECK-NEXT:    sbfx x8, x0, #0, #35
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    sdiv x10, x9, x8
+; CHECK-NEXT:    msub x8, x10, x8, x9
 ; CHECK-NEXT:    clz x8, x8
 ; CHECK-NEXT:    sub x8, x8, #29
 ; CHECK-NEXT:    ubfx x0, x8, #5, #30
@@ -16,10 +16,10 @@ define i35 @f(i35 %0) {
 ;
 ; GISEL-LABEL: f:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    mov w8, #1 // =0x1
-; GISEL-NEXT:    sbfx x9, x0, #0, #35
-; GISEL-NEXT:    sdiv x10, x8, x9
-; GISEL-NEXT:    msub x8, x10, x9, x8
+; GISEL-NEXT:    sbfx x8, x0, #0, #35
+; GISEL-NEXT:    mov w9, #1 // =0x1
+; GISEL-NEXT:    sdiv x10, x9, x8
+; GISEL-NEXT:    msub x8, x10, x8, x9
 ; GISEL-NEXT:    and x8, x8, #0x7ffffffff
 ; GISEL-NEXT:    clz x8, x8
 ; GISEL-NEXT:    sub x8, x8, #29

diff --git a/llvm/test/CodeGen/AArch64/predicated-add-sub.ll b/llvm/test/CodeGen/AArch64/predicated-add-sub.ll
index 884ee19fa01b5c..6b3cfc040cb3d4 100644
--- a/llvm/test/CodeGen/AArch64/predicated-add-sub.ll
+++ b/llvm/test/CodeGen/AArch64/predicated-add-sub.ll
@@ -83,8 +83,8 @@ define <vscale x 2 x i64> @zext.add.2xi64(<vscale x 2 x i64> %a, <vscale x 2 x i
 define <vscale x 8 x i32> @zext.add.8xi32(<vscale x 8 x i32> %a, <vscale x 8 x i1> %v) #0 {
 ; CHECK-LABEL: zext.add.8xi32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.s, #1 // =0x1
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    mov z2.s, #1 // =0x1
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    add z1.s, p1/m, z1.s, z2.s
 ; CHECK-NEXT:    add z0.s, p0/m, z0.s, z2.s
@@ -98,16 +98,16 @@ define <vscale x 16 x i32> @zext.add.16xi32(<vscale x 16 x i32> %a, <vscale x 16
 ; CHECK-LABEL: zext.add.16xi32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    mov z4.s, #1 // =0x1
+; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpkhi p2.h, p1.b
 ; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    punpkhi p3.h, p0.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    add z1.s, p3/m, z1.s, z4.s
-; CHECK-NEXT:    add z0.s, p0/m, z0.s, z4.s
-; CHECK-NEXT:    add z2.s, p1/m, z2.s, z4.s
+; CHECK-NEXT:    punpklo p3.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    add z3.s, p2/m, z3.s, z4.s
+; CHECK-NEXT:    add z2.s, p1/m, z2.s, z4.s
+; CHECK-NEXT:    add z0.s, p3/m, z0.s, z4.s
+; CHECK-NEXT:    add z1.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT:    ret
   %extend = zext <vscale x 16 x i1> %v to <vscale x 16 x i32>
   %result = add <vscale x 16 x i32> %a, %extend
@@ -194,8 +194,8 @@ define <vscale x 2 x i64> @zext.sub.2xi64(<vscale x 2 x i64> %a, <vscale x 2 x i
 define <vscale x 8 x i32> @zext.sub.8xi32(<vscale x 8 x i32> %a, <vscale x 8 x i1> %v) #0 {
 ; CHECK-LABEL: zext.sub.8xi32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    mov z2.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    add z0.s, p1/m, z0.s, z2.s
 ; CHECK-NEXT:    add z1.s, p0/m, z1.s, z2.s
@@ -213,11 +213,11 @@ define <vscale x 16 x i32> @zext.sub.16xi32(<vscale x 16 x i32> %a, <vscale x 16
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    punpklo p2.h, p1.b
 ; CHECK-NEXT:    punpkhi p1.h, p1.b
+; CHECK-NEXT:    punpklo p3.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    add z0.s, p2/m, z0.s, z4.s
 ; CHECK-NEXT:    add z1.s, p1/m, z1.s, z4.s
-; CHECK-NEXT:    punpklo p1.h, p0.b
-; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    add z2.s, p1/m, z2.s, z4.s
+; CHECK-NEXT:    add z2.s, p3/m, z2.s, z4.s
 ; CHECK-NEXT:    add z3.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    ret
   %extend = zext <vscale x 16 x i1> %v to <vscale x 16 x i32>
@@ -305,8 +305,8 @@ define <vscale x 2 x i64> @sext.add.2xi64(<vscale x 2 x i64> %a, <vscale x 2 x i
 define <vscale x 8 x i32> @sext.add.8xi32(<vscale x 8 x i32> %a, <vscale x 8 x i1> %v) #0 {
 ; CHECK-LABEL: sext.add.8xi32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    mov z2.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    add z1.s, p1/m, z1.s, z2.s
 ; CHECK-NEXT:    add z0.s, p0/m, z0.s, z2.s
@@ -320,16 +320,16 @@ define <vscale x 16 x i32> @sext.add.16xi32(<vscale x 16 x i32> %a, <vscale x 16
 ; CHECK-LABEL: sext.add.16xi32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    mov z4.s, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpkhi p2.h, p1.b
 ; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    punpkhi p3.h, p0.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    add z1.s, p3/m, z1.s, z4.s
-; CHECK-NEXT:    add z0.s, p0/m, z0.s, z4.s
-; CHECK-NEXT:    add z2.s, p1/m, z2.s, z4.s
+; CHECK-NEXT:    punpklo p3.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    add z3.s, p2/m, z3.s, z4.s
+; CHECK-NEXT:    add z2.s, p1/m, z2.s, z4.s
+; CHECK-NEXT:    add z0.s, p3/m, z0.s, z4.s
+; CHECK-NEXT:    add z1.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT:    ret
   %extend = sext <vscale x 16 x i1> %v to <vscale x 16 x i32>
   %result = add <vscale x 16 x i32> %a, %extend
@@ -416,8 +416,8 @@ define <vscale x 2 x i64> @sext.sub.2xi64(<vscale x 2 x i64> %a, <vscale x 2 x i
 define <vscale x 8 x i32> @sext.sub.8xi32(<vscale x 8 x i32> %a, <vscale x 8 x i1> %v) #0 {
 ; CHECK-LABEL: sext.sub.8xi32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    mov z2.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    sub z1.s, p1/m, z1.s, z2.s
 ; CHECK-NEXT:    sub z0.s, p0/m, z0.s, z2.s
@@ -431,16 +431,16 @@ define <vscale x 16 x i32> @sext.sub.16xi32(<vscale x 16 x i32> %a, <vscale x 16
 ; CHECK-LABEL: sext.sub.16xi32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    mov z4.s, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpkhi p2.h, p1.b
 ; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    punpkhi p3.h, p0.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    sub z1.s, p3/m, z1.s, z4.s
-; CHECK-NEXT:    sub z0.s, p0/m, z0.s, z4.s
-; CHECK-NEXT:    sub z2.s, p1/m, z2.s, z4.s
+; CHECK-NEXT:    punpklo p3.h, p0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    sub z3.s, p2/m, z3.s, z4.s
+; CHECK-NEXT:    sub z2.s, p1/m, z2.s, z4.s
+; CHECK-NEXT:    sub z0.s, p3/m, z0.s, z4.s
+; CHECK-NEXT:    sub z1.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT:    ret
   %extend = sext <vscale x 16 x i1> %v to <vscale x 16 x i32>
   %result = sub <vscale x 16 x i32> %a, %extend

diff --git a/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll
index 7fc83c725b0c43..70c3e8a9aa0bcb 100644
--- a/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll
+++ b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll
@@ -48,11 +48,10 @@ define <8 x i16> @not_not_trunc_concat_multiple_uses(<4 x i32> %x, <4 x i32> %y)
 ; CHECK-NEXT:    xtn v1.4h, v1.4s
 ; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    mvn v1.8b, v1.8b
-; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    mov v2.d[1], v1.d[0]
-; CHECK-NEXT:    mov v0.d[1], v0.d[0]
-; CHECK-NEXT:    add v0.8h, v2.8h, v0.8h
+; CHECK-NEXT:    add v2.4h, v0.4h, v1.4h
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    mov v2.d[1], v2.d[0]
+; CHECK-NEXT:    add v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    ret
   %notx = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
   %trnx = trunc <4 x i32> %notx to <4 x i16>

diff --git a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
index 25c9a8ea24ac0b..5b501762418ef5 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
@@ -33,14 +33,14 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:  Lloh0:
 ; CHECK-NEXT:    adrp x14, __DefaultRuneLocale@GOTPAGE
-; CHECK-NEXT:    mov x9, xzr
 ; CHECK-NEXT:    ldrb w12, [x0, #4]
 ; CHECK-NEXT:    ldrb w13, [x1, #4]
-; CHECK-NEXT:    ldr x10, [x0, #16]
-; CHECK-NEXT:    ldr x11, [x1, #16]
+; CHECK-NEXT:    ldr x9, [x0, #16]
+; CHECK-NEXT:    ldr x10, [x1, #16]
+; CHECK-NEXT:    mov x11, xzr
 ; CHECK-NEXT:  Lloh1:
 ; CHECK-NEXT:    ldr x14, [x14, __DefaultRuneLocale@GOTPAGEOFF]
-; CHECK-NEXT:    ldrsb x8, [x10, x9]
+; CHECK-NEXT:    ldrsb x8, [x9, x11]
 ; CHECK-NEXT:    tbz x8, #63, LBB0_3
 ; CHECK-NEXT:  LBB0_2: ; %cond.false.i.i
 ; CHECK-NEXT:    stp x9, x0, [sp, #32] ; 16-byte Folded Spill
@@ -69,7 +69,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    and w8, w8, #0x8000
 ; CHECK-NEXT:    cbnz w8, LBB0_6
 ; CHECK-NEXT:  LBB0_4: ; %lor.rhs
-; CHECK-NEXT:    ldrsb x8, [x11, x9]
+; CHECK-NEXT:    ldrsb x8, [x10, x11]
 ; CHECK-NEXT:    tbnz x8, #63, LBB0_8
 ; CHECK-NEXT:  ; %bb.5: ; %cond.true.i.i217
 ; CHECK-NEXT:    add x8, x14, x8, lsl #2
@@ -77,13 +77,13 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    and w8, w8, #0x8000
 ; CHECK-NEXT:    cbz w8, LBB0_9
 ; CHECK-NEXT:  LBB0_6: ; %while.body
-; CHECK-NEXT:    ldrb w8, [x10, x9]
-; CHECK-NEXT:    ldrb w15, [x11, x9]
+; CHECK-NEXT:    ldrb w8, [x9, x11]
+; CHECK-NEXT:    ldrb w15, [x10, x11]
 ; CHECK-NEXT:    cmp w8, w15
 ; CHECK-NEXT:    b.ne LBB0_42
 ; CHECK-NEXT:  ; %bb.7: ; %if.end17
-; CHECK-NEXT:    add x9, x9, #1
-; CHECK-NEXT:    ldrsb x8, [x10, x9]
+; CHECK-NEXT:    add x11, x11, #1
+; CHECK-NEXT:    ldrsb x8, [x9, x11]
 ; CHECK-NEXT:    tbz x8, #63, LBB0_3
 ; CHECK-NEXT:    b LBB0_2
 ; CHECK-NEXT:  LBB0_8: ; %cond.false.i.i219
@@ -111,16 +111,16 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    cbnz w8, LBB0_24
 ; CHECK-NEXT:  ; %bb.10: ; %if.then23
 ; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldrb w8, [x10, x9]
+; CHECK-NEXT:    ldrb w8, [x9, x11]
 ; CHECK-NEXT:    ldrb w13, [x12]
 ; CHECK-NEXT:    cmp w13, #83
 ; CHECK-NEXT:    b.eq LBB0_19
 ; CHECK-NEXT:  LBB0_11: ; %while.cond59.preheader
 ; CHECK-NEXT:    cbz w8, LBB0_23
 ; CHECK-NEXT:  LBB0_12: ; %land.rhs.preheader
-; CHECK-NEXT:    add x10, x10, x9
-; CHECK-NEXT:    add x9, x11, x9
-; CHECK-NEXT:    add x10, x10, #1
+; CHECK-NEXT:    add x12, x9, x11
+; CHECK-NEXT:    add x9, x10, x11
+; CHECK-NEXT:    add x10, x12, #1
 ; CHECK-NEXT:  LBB0_13: ; %land.rhs
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb w11, [x9], #1
@@ -135,8 +135,8 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    b.eq LBB0_18
 ; CHECK-NEXT:  ; %bb.16: ; %lor.lhs.false74
 ; CHECK-NEXT:    ; in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    cmp w8, w11
+; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    b.ne LBB0_43
 ; CHECK-NEXT:  ; %bb.17: ; %lor.lhs.false74
 ; CHECK-NEXT:    ; in Loop: Header=BB0_13 Depth=1
@@ -154,12 +154,12 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    cmp w8, #112
 ; CHECK-NEXT:    b.ne LBB0_12
 ; CHECK-NEXT:  ; %bb.21: ; %land.lhs.true35
-; CHECK-NEXT:    ldrb w13, [x11, x9]
+; CHECK-NEXT:    ldrb w13, [x10, x11]
 ; CHECK-NEXT:    cmp w13, #112
 ; CHECK-NEXT:    b.ne LBB0_12
 ; CHECK-NEXT:  ; %bb.22: ; %land.lhs.true43
-; CHECK-NEXT:    sub x12, x10, x12
-; CHECK-NEXT:    add x12, x12, x9
+; CHECK-NEXT:    sub x12, x9, x12
+; CHECK-NEXT:    add x12, x12, x11
 ; CHECK-NEXT:    cmp x12, #1
 ; CHECK-NEXT:    b.ne LBB0_44
 ; CHECK-NEXT:  LBB0_23:
@@ -172,7 +172,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    cmp w13, #2
 ; CHECK-NEXT:    b.ne LBB0_33
 ; CHECK-NEXT:  ; %bb.26: ; %while.cond95.preheader
-; CHECK-NEXT:    ldrb w12, [x10, x9]
+; CHECK-NEXT:    ldrb w12, [x9, x11]
 ; CHECK-NEXT:    cbz w12, LBB0_23
 ; CHECK-NEXT:  ; %bb.27: ; %land.rhs99.preheader
 ; CHECK-NEXT:    mov x8, xzr
@@ -180,15 +180,15 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    b LBB0_29
 ; CHECK-NEXT:  LBB0_28: ; %if.then117
 ; CHECK-NEXT:    ; in Loop: Header=BB0_29 Depth=1
-; CHECK-NEXT:    add x12, x10, x8
+; CHECK-NEXT:    add x12, x9, x8
 ; CHECK-NEXT:    add x8, x8, #1
-; CHECK-NEXT:    add x12, x12, x9
+; CHECK-NEXT:    add x12, x12, x11
 ; CHECK-NEXT:    ldrb w12, [x12, #1]
 ; CHECK-NEXT:    cbz w12, LBB0_43
 ; CHECK-NEXT:  LBB0_29: ; %land.rhs99
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x13, x11, x8
-; CHECK-NEXT:    ldrb w13, [x13, x9]
+; CHECK-NEXT:    add x13, x10, x8
+; CHECK-NEXT:    ldrb w13, [x13, x11]
 ; CHECK-NEXT:    cbz w13, LBB0_23
 ; CHECK-NEXT:  ; %bb.30: ; %while.body104
 ; CHECK-NEXT:    ; in Loop: Header=BB0_29 Depth=1
@@ -204,14 +204,14 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    b.eq LBB0_28
 ; CHECK-NEXT:    b LBB0_42
 ; CHECK-NEXT:  LBB0_33: ; %if.else123
-; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    cmp w13, #1
+; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    b.ne LBB0_43
 ; CHECK-NEXT:  ; %bb.34: ; %if.else123
 ; CHECK-NEXT:    cmp w12, #2
 ; CHECK-NEXT:    b.ne LBB0_43
 ; CHECK-NEXT:  ; %bb.35: ; %while.cond130.preheader
-; CHECK-NEXT:    ldrb w8, [x10, x9]
+; CHECK-NEXT:    ldrb w8, [x9, x11]
 ; CHECK-NEXT:    cbz w8, LBB0_23
 ; CHECK-NEXT:  ; %bb.36: ; %land.rhs134.preheader
 ; CHECK-NEXT:    mov x12, xzr
@@ -219,15 +219,15 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    b LBB0_38
 ; CHECK-NEXT:  LBB0_37: ; %if.then152
 ; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
-; CHECK-NEXT:    add x8, x10, x12
+; CHECK-NEXT:    add x8, x9, x12
 ; CHECK-NEXT:    add x12, x12, #1
-; CHECK-NEXT:    add x8, x8, x9
+; CHECK-NEXT:    add x8, x8, x11
 ; CHECK-NEXT:    ldrb w8, [x8, #1]
 ; CHECK-NEXT:    cbz w8, LBB0_43
 ; CHECK-NEXT:  LBB0_38: ; %land.rhs134
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x13, x11, x12
-; CHECK-NEXT:    ldrb w13, [x13, x9]
+; CHECK-NEXT:    add x13, x10, x12
+; CHECK-NEXT:    ldrb w13, [x13, x11]
 ; CHECK-NEXT:    cbz w13, LBB0_23
 ; CHECK-NEXT:  ; %bb.39: ; %while.body139
 ; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
@@ -251,7 +251,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    cmp x12, #2
 ; CHECK-NEXT:    b.ne LBB0_11
 ; CHECK-NEXT:  ; %bb.45: ; %land.lhs.true52
-; CHECK-NEXT:    add x12, x10, x9
+; CHECK-NEXT:    add x12, x9, x11
 ; CHECK-NEXT:    mov w0, #1 ; =0x1
 ; CHECK-NEXT:    ldurb w12, [x12, #-1]
 ; CHECK-NEXT:    cmp w12, #73

diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
index d1d9db10351377..43f40badc1ae2e 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -8,34 +8,40 @@
 define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-LABEL: run_test:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset b8, -8
-; CHECK-NEXT:    .cfi_offset b9, -16
-; CHECK-NEXT:    .cfi_offset b10, -24
-; CHECK-NEXT:    .cfi_offset b11, -32
-; CHECK-NEXT:    .cfi_offset b12, -40
-; CHECK-NEXT:    .cfi_offset b13, -48
-; CHECK-NEXT:    .cfi_offset b14, -56
-; CHECK-NEXT:    .cfi_offset b15, -64
-; CHECK-NEXT:    movi v14.2d, #0000000000000000
+; CHECK-NEXT:    sub sp, sp, #192
+; CHECK-NEXT:    .cfi_def_cfa_offset 192
+; CHECK-NEXT:    stp d15, d14, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #128] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #144] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #160] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w21, -24
+; CHECK-NEXT:    .cfi_offset w22, -32
+; CHECK-NEXT:    .cfi_offset b8, -40
+; CHECK-NEXT:    .cfi_offset b9, -48
+; CHECK-NEXT:    .cfi_offset b10, -56
+; CHECK-NEXT:    .cfi_offset b11, -64
+; CHECK-NEXT:    .cfi_offset b12, -72
+; CHECK-NEXT:    .cfi_offset b13, -80
+; CHECK-NEXT:    .cfi_offset b14, -88
+; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    mov x9, xzr
 ; CHECK-NEXT:    adrp x10, B+48
 ; CHECK-NEXT:    add x10, x10, :lo12:B+48
 ; CHECK-NEXT:    adrp x11, A
 ; CHECK-NEXT:    add x11, x11, :lo12:A
-; CHECK-NEXT:    // implicit-def: $q2
+; CHECK-NEXT:    // implicit-def: $q6
+; CHECK-NEXT:    // implicit-def: $q7
+; CHECK-NEXT:    // implicit-def: $q10
 ; CHECK-NEXT:    // implicit-def: $q3
-; CHECK-NEXT:    // implicit-def: $q15
 ; CHECK-NEXT:    // implicit-def: $q4
 ; CHECK-NEXT:    // implicit-def: $q5
-; CHECK-NEXT:    // implicit-def: $q6
-; CHECK-NEXT:    // implicit-def: $q7
+; CHECK-NEXT:    // implicit-def: $q2
 ; CHECK-NEXT:    // implicit-def: $q16
 ; CHECK-NEXT:    // implicit-def: $q17
 ; CHECK-NEXT:    // implicit-def: $q18
@@ -46,131 +52,189 @@ define dso_local void @run_test() local_unnamed_addr uwtable {
 ; CHECK-NEXT:    // implicit-def: $q23
 ; CHECK-NEXT:    // implicit-def: $q24
 ; CHECK-NEXT:    // implicit-def: $q25
-; CHECK-NEXT:    // implicit-def: $q26
 ; CHECK-NEXT:    // implicit-def: $q27
+; CHECK-NEXT:    // implicit-def: $q26
 ; CHECK-NEXT:    // implicit-def: $q28
-; CHECK-NEXT:    // implicit-def: $q29
 ; CHECK-NEXT:    // implicit-def: $q30
+; CHECK-NEXT:    // implicit-def: $q15
+; CHECK-NEXT:    // implicit-def: $q29
 ; CHECK-NEXT:    // implicit-def: $q31
-; CHECK-NEXT:    // implicit-def: $q8
-; CHECK-NEXT:    // implicit-def: $q9
-; CHECK-NEXT:    // implicit-def: $q10
 ; CHECK-NEXT:    // implicit-def: $q11
+; CHECK-NEXT:    // implicit-def: $q9
+; CHECK-NEXT:    // kill: killed $q6
 ; CHECK-NEXT:    // implicit-def: $q12
 ; CHECK-NEXT:    // implicit-def: $q13
+; CHECK-NEXT:    // implicit-def: $q6
+; CHECK-NEXT:    // kill: killed $q6
 ; CHECK-NEXT:  .LBB0_1: // %for.cond1.preheader
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov x12, xzr
-; CHECK-NEXT:    stp q15, q14, [sp] // 32-byte Folded Spill
 ; CHECK-NEXT:    ldr q14, [x8]
-; CHECK-NEXT:    add x15, x11, x8
-; CHECK-NEXT:    ldr q15, [x10], #64
-; CHECK-NEXT:    ldr q0, [x12]
-; CHECK-NEXT:    add x9, x9, #1
-; CHECK-NEXT:    ldr x12, [x12]
-; CHECK-NEXT:    fmov x13, d14
+; CHECK-NEXT:    mov x12, xzr
+; CHECK-NEXT:    add x7, x11, x8
+; CHECK-NEXT:    ldr x13, [x12]
+; CHECK-NEXT:    ldr x5, [x8]
+; CHECK-NEXT:    ldr x7, [x7, #128]
 ; CHECK-NEXT:    mov x14, v14.d[1]
-; CHECK-NEXT:    fmov x0, d15
-; CHECK-NEXT:    fmov x16, d0
-; CHECK-NEXT:    ldr x15, [x15, #128]
-; CHECK-NEXT:    mul x17, x13, x12
-; CHECK-NEXT:    mov x18, v0.d[1]
-; CHECK-NEXT:    mul x4, x0, x12
-; CHECK-NEXT:    mul x1, x16, x12
-; CHECK-NEXT:    mul x3, x14, x12
-; CHECK-NEXT:    fmov d0, x17
-; CHECK-NEXT:    mul x5, x13, x15
-; CHECK-NEXT:    mov x17, v15.d[1]
-; CHECK-NEXT:    fmov d15, x4
-; CHECK-NEXT:    fmov d14, x1
-; CHECK-NEXT:    mul x1, x18, x12
-; CHECK-NEXT:    mov v0.d[1], x3
-; CHECK-NEXT:    mul x3, x16, x15
-; CHECK-NEXT:    ldr x2, [x8], #8
-; CHECK-NEXT:    mul x12, x17, x12
-; CHECK-NEXT:    fmov d1, x5
-; CHECK-NEXT:    mov v14.d[1], x1
-; CHECK-NEXT:    mul x1, x14, x15
-; CHECK-NEXT:    add v12.2d, v12.2d, v0.2d
-; CHECK-NEXT:    mul x13, x13, x2
-; CHECK-NEXT:    fmov d0, x3
-; CHECK-NEXT:    mul x3, x0, x15
-; CHECK-NEXT:    mov v15.d[1], x12
-; CHECK-NEXT:    mul x12, x18, x2
-; CHECK-NEXT:    mov v1.d[1], x1
-; CHECK-NEXT:    mul x18, x18, x15
-; CHECK-NEXT:    mul x16, x16, x2
+; CHECK-NEXT:    stp q22, q26, [sp] // 32-byte Folded Spill
+; CHECK-NEXT:    mov v22.16b, v9.16b
+; CHECK-NEXT:    stp q31, q15, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT:    ldr q15, [x12]
+; CHECK-NEXT:    fmov x12, d14
+; CHECK-NEXT:    ldr q14, [x10], #64
+; CHECK-NEXT:    mov v9.16b, v30.16b
+; CHECK-NEXT:    fmov x17, d15
+; CHECK-NEXT:    mov x16, v15.d[1]
+; CHECK-NEXT:    mov v30.16b, v27.16b
+; CHECK-NEXT:    mul x15, x12, x13
+; CHECK-NEXT:    mov x0, v14.d[1]
+; CHECK-NEXT:    fmov x4, d14
+; CHECK-NEXT:    mov v27.16b, v23.16b
+; CHECK-NEXT:    mov v23.16b, v19.16b
+; CHECK-NEXT:    mov v19.16b, v2.16b
+; CHECK-NEXT:    mul x1, x14, x13
+; CHECK-NEXT:    mov v8.16b, v28.16b
+; CHECK-NEXT:    mov v28.16b, v24.16b
+; CHECK-NEXT:    mov v24.16b, v20.16b
+; CHECK-NEXT:    mov v20.16b, v16.16b
+; CHECK-NEXT:    mov v16.16b, v3.16b
+; CHECK-NEXT:    mul x18, x17, x13
+; CHECK-NEXT:    mov v31.16b, v18.16b
+; CHECK-NEXT:    mov v26.16b, v5.16b
+; CHECK-NEXT:    fmov d15, x15
+; CHECK-NEXT:    mov v5.16b, v1.16b
+; CHECK-NEXT:    mov v18.16b, v10.16b
+; CHECK-NEXT:    mul x2, x16, x13
+; CHECK-NEXT:    mov v10.16b, v29.16b
+; CHECK-NEXT:    mov v29.16b, v25.16b
+; CHECK-NEXT:    mov v25.16b, v21.16b
+; CHECK-NEXT:    mov v21.16b, v17.16b
+; CHECK-NEXT:    mov v17.16b, v4.16b
+; CHECK-NEXT:    mov v15.d[1], x1
+; CHECK-NEXT:    mul x19, x12, x5
+; CHECK-NEXT:    add x8, x8, #8
+; CHECK-NEXT:    fmov d14, x18
 ; CHECK-NEXT:    cmp x8, #64
-; CHECK-NEXT:    mul x15, x17, x15
-; CHECK-NEXT:    add v13.2d, v13.2d, v14.2d
-; CHECK-NEXT:    mul x14, x14, x2
-; CHECK-NEXT:    add v11.2d, v11.2d, v14.2d
-; CHECK-NEXT:    fmov d14, x3
-; CHECK-NEXT:    add v10.2d, v10.2d, v15.2d
-; CHECK-NEXT:    fmov d15, x13
-; CHECK-NEXT:    mov v0.d[1], x18
-; CHECK-NEXT:    mul x13, x0, x2
-; CHECK-NEXT:    add v29.2d, v29.2d, v1.2d
-; CHECK-NEXT:    fmov d1, x16
-; CHECK-NEXT:    mov v14.d[1], x15
-; CHECK-NEXT:    mov v15.d[1], x14
-; CHECK-NEXT:    mov v1.d[1], x12
-; CHECK-NEXT:    mul x12, x17, x2
-; CHECK-NEXT:    add v28.2d, v28.2d, v0.2d
+; CHECK-NEXT:    add x9, x9, #1
+; CHECK-NEXT:    mul x12, x12, x7
+; CHECK-NEXT:    mov v14.d[1], x2
+; CHECK-NEXT:    add v12.2d, v12.2d, v15.2d
+; CHECK-NEXT:    mul x3, x0, x13
+; CHECK-NEXT:    fmov d1, x19
+; CHECK-NEXT:    mul x13, x4, x13
+; CHECK-NEXT:    fmov d2, x12
+; CHECK-NEXT:    mul x6, x14, x5
+; CHECK-NEXT:    add v6.2d, v13.2d, v14.2d
+; CHECK-NEXT:    mov v13.16b, v12.16b
+; CHECK-NEXT:    ldr q12, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    mul x14, x14, x7
 ; CHECK-NEXT:    fmov d0, x13
-; CHECK-NEXT:    add v27.2d, v27.2d, v14.2d
-; CHECK-NEXT:    ldr q14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add v8.2d, v8.2d, v15.2d
-; CHECK-NEXT:    mov v0.d[1], x12
-; CHECK-NEXT:    add v25.2d, v25.2d, v15.2d
-; CHECK-NEXT:    add v22.2d, v22.2d, v15.2d
-; CHECK-NEXT:    add v18.2d, v18.2d, v15.2d
-; CHECK-NEXT:    add v6.2d, v6.2d, v15.2d
-; CHECK-NEXT:    add v14.2d, v14.2d, v15.2d
-; CHECK-NEXT:    ldr q15, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    add v9.2d, v9.2d, v1.2d
-; CHECK-NEXT:    add v31.2d, v31.2d, v1.2d
-; CHECK-NEXT:    add v26.2d, v26.2d, v1.2d
+; CHECK-NEXT:    add v12.2d, v12.2d, v14.2d
+; CHECK-NEXT:    mul x21, x17, x7
+; CHECK-NEXT:    mov v1.d[1], x6
+; CHECK-NEXT:    mul x18, x4, x7
+; CHECK-NEXT:    mov v0.d[1], x3
+; CHECK-NEXT:    mov v2.d[1], x14
+; CHECK-NEXT:    str q12, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v12.16b, v13.16b
+; CHECK-NEXT:    mul x13, x17, x5
+; CHECK-NEXT:    mov v13.16b, v6.16b
+; CHECK-NEXT:    fmov d3, x21
+; CHECK-NEXT:    ldp q15, q6, [sp, #48] // 32-byte Folded Reload
+; CHECK-NEXT:    mul x20, x16, x7
+; CHECK-NEXT:    add v11.2d, v11.2d, v1.2d
+; CHECK-NEXT:    fmov d4, x18
+; CHECK-NEXT:    mul x22, x0, x7
+; CHECK-NEXT:    add v6.2d, v6.2d, v0.2d
+; CHECK-NEXT:    add v15.2d, v15.2d, v2.2d
+; CHECK-NEXT:    fmov d14, x13
+; CHECK-NEXT:    mov v2.16b, v19.16b
+; CHECK-NEXT:    mov v19.16b, v23.16b
+; CHECK-NEXT:    mul x14, x4, x5
+; CHECK-NEXT:    mov v23.16b, v27.16b
+; CHECK-NEXT:    mov v27.16b, v30.16b
+; CHECK-NEXT:    mov v3.d[1], x20
+; CHECK-NEXT:    mov v30.16b, v9.16b
+; CHECK-NEXT:    mov v9.16b, v22.16b
+; CHECK-NEXT:    mul x12, x16, x5
+; CHECK-NEXT:    str q6, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    mov v6.16b, v18.16b
+; CHECK-NEXT:    mov v4.d[1], x22
+; CHECK-NEXT:    add v27.2d, v27.2d, v1.2d
 ; CHECK-NEXT:    add v23.2d, v23.2d, v1.2d
-; CHECK-NEXT:    add v21.2d, v21.2d, v1.2d
+; CHECK-NEXT:    mul x13, x0, x5
 ; CHECK-NEXT:    add v19.2d, v19.2d, v1.2d
-; CHECK-NEXT:    add v17.2d, v17.2d, v1.2d
-; CHECK-NEXT:    add v7.2d, v7.2d, v1.2d
-; CHECK-NEXT:    add v5.2d, v5.2d, v1.2d
-; CHECK-NEXT:    add v15.2d, v15.2d, v1.2d
-; CHECK-NEXT:    add v3.2d, v3.2d, v1.2d
-; CHECK-NEXT:    add v30.2d, v30.2d, v0.2d
-; CHECK-NEXT:    add v24.2d, v24.2d, v0.2d
-; CHECK-NEXT:    add v20.2d, v20.2d, v0.2d
-; CHECK-NEXT:    add v16.2d, v16.2d, v0.2d
+; CHECK-NEXT:    add v2.2d, v2.2d, v1.2d
+; CHECK-NEXT:    fmov d0, x14
+; CHECK-NEXT:    add v30.2d, v30.2d, v3.2d
+; CHECK-NEXT:    mov v3.16b, v16.16b
+; CHECK-NEXT:    mov v16.16b, v20.16b
+; CHECK-NEXT:    mov v20.16b, v24.16b
+; CHECK-NEXT:    mov v24.16b, v28.16b
+; CHECK-NEXT:    mov v14.d[1], x12
+; CHECK-NEXT:    mov v28.16b, v8.16b
+; CHECK-NEXT:    add v1.2d, v5.2d, v1.2d
+; CHECK-NEXT:    add v28.2d, v8.2d, v4.2d
+; CHECK-NEXT:    mov v4.16b, v17.16b
+; CHECK-NEXT:    mov v17.16b, v21.16b
+; CHECK-NEXT:    mov v0.d[1], x13
+; CHECK-NEXT:    mov v21.16b, v25.16b
+; CHECK-NEXT:    mov v25.16b, v29.16b
+; CHECK-NEXT:    mov v29.16b, v10.16b
+; CHECK-NEXT:    mov v5.16b, v26.16b
+; CHECK-NEXT:    mov v18.16b, v31.16b
+; CHECK-NEXT:    ldp q22, q26, [sp] // 32-byte Folded Reload
+; CHECK-NEXT:    ldr q31, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    add v9.2d, v9.2d, v14.2d
+; CHECK-NEXT:    add v24.2d, v24.2d, v14.2d
+; CHECK-NEXT:    add v20.2d, v20.2d, v14.2d
+; CHECK-NEXT:    add v31.2d, v31.2d, v14.2d
+; CHECK-NEXT:    add v18.2d, v18.2d, v14.2d
+; CHECK-NEXT:    add v16.2d, v16.2d, v14.2d
+; CHECK-NEXT:    add v26.2d, v26.2d, v14.2d
+; CHECK-NEXT:    add v22.2d, v22.2d, v14.2d
+; CHECK-NEXT:    add v5.2d, v5.2d, v14.2d
+; CHECK-NEXT:    add v3.2d, v3.2d, v14.2d
+; CHECK-NEXT:    add v10.2d, v6.2d, v14.2d
+; CHECK-NEXT:    add v29.2d, v29.2d, v0.2d
+; CHECK-NEXT:    add v25.2d, v25.2d, v0.2d
+; CHECK-NEXT:    add v21.2d, v21.2d, v0.2d
+; CHECK-NEXT:    add v17.2d, v17.2d, v0.2d
 ; CHECK-NEXT:    add v4.2d, v4.2d, v0.2d
-; CHECK-NEXT:    add v2.2d, v2.2d, v0.2d
+; CHECK-NEXT:    add v7.2d, v7.2d, v0.2d
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
 ; CHECK-NEXT:    adrp x8, C
 ; CHECK-NEXT:    add x8, x8, :lo12:C
+; CHECK-NEXT:    ldr q0, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q13, q12, [x8]
-; CHECK-NEXT:    stp q11, q10, [x8, #32]
-; CHECK-NEXT:    stp q9, q8, [x8, #64]
-; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    stp q31, q30, [x8, #96]
-; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    stp q29, q28, [x8, #144]
-; CHECK-NEXT:    stp q27, q26, [x8, #176]
-; CHECK-NEXT:    str q25, [x8, #208]
-; CHECK-NEXT:    stp q24, q23, [x8, #240]
-; CHECK-NEXT:    stp q22, q21, [x8, #272]
-; CHECK-NEXT:    stp q20, q19, [x8, #304]
-; CHECK-NEXT:    stp q18, q17, [x8, #336]
-; CHECK-NEXT:    stp q16, q7, [x8, #368]
-; CHECK-NEXT:    stp q6, q5, [x8, #400]
-; CHECK-NEXT:    stp q4, q15, [x8, #432]
-; CHECK-NEXT:    stp q14, q3, [x8, #464]
-; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    str q2, [x8, #496]
-; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ldr q6, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    stp q9, q11, [x8, #64]
+; CHECK-NEXT:    ldp x20, x19, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT:    stp q15, q30, [x8, #144]
+; CHECK-NEXT:    ldp x22, x21, [sp, #160] // 16-byte Folded Reload
+; CHECK-NEXT:    stp q4, q3, [x8, #432]
+; CHECK-NEXT:    ldp d9, d8, [sp, #144] // 16-byte Folded Reload
+; CHECK-NEXT:    stp q0, q6, [x8, #32]
+; CHECK-NEXT:    ldp d13, d12, [sp, #112] // 16-byte Folded Reload
+; CHECK-NEXT:    stp q31, q29, [x8, #96]
+; CHECK-NEXT:    ldp d15, d14, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    stp q28, q26, [x8, #176]
+; CHECK-NEXT:    str q27, [x8, #208]
+; CHECK-NEXT:    stp q25, q24, [x8, #240]
+; CHECK-NEXT:    stp q23, q22, [x8, #272]
+; CHECK-NEXT:    stp q21, q20, [x8, #304]
+; CHECK-NEXT:    stp q19, q18, [x8, #336]
+; CHECK-NEXT:    stp q17, q16, [x8, #368]
+; CHECK-NEXT:    stp q2, q5, [x8, #400]
+; CHECK-NEXT:    stp q1, q10, [x8, #464]
+; CHECK-NEXT:    ldp d11, d10, [sp, #128] // 16-byte Folded Reload
+; CHECK-NEXT:    str q7, [x8, #496]
+; CHECK-NEXT:    add sp, sp, #192
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w20
+; CHECK-NEXT:    .cfi_restore w21
+; CHECK-NEXT:    .cfi_restore w22
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10

diff --git a/llvm/test/CodeGen/AArch64/rand.ll b/llvm/test/CodeGen/AArch64/rand.ll
index b742a9ab43d35f..5ba356e86cba27 100644
--- a/llvm/test/CodeGen/AArch64/rand.ll
+++ b/llvm/test/CodeGen/AArch64/rand.ll
@@ -7,9 +7,9 @@ define  i32 @rndr(ptr %__addr) {
 ; CHECK-NEXT:    mrs x10, RNDR
 ; CHECK-NEXT:    mov x9, x0
 ; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    str x10, [x9]
 ; CHECK-NEXT:    and w8, w8, #0x1
 ; CHECK-NEXT:    mov w0, w8
-; CHECK-NEXT:    str x10, [x9]
 ; CHECK-NEXT:    ret
   %1 = tail call { i64, i1 } @llvm.aarch64.rndr()
   %2 = extractvalue { i64, i1 } %1, 0
@@ -26,9 +26,9 @@ define  i32 @rndrrs(ptr  %__addr) {
 ; CHECK-NEXT:    mrs x10, RNDRRS
 ; CHECK-NEXT:    mov x9, x0
 ; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    str x10, [x9]
 ; CHECK-NEXT:    and w8, w8, #0x1
 ; CHECK-NEXT:    mov w0, w8
-; CHECK-NEXT:    str x10, [x9]
 ; CHECK-NEXT:    ret
   %1 = tail call { i64, i1 } @llvm.aarch64.rndrrs()
   %2 = extractvalue { i64, i1 } %1, 0

diff --git a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll
index d72a9a9f76b867..b9d93942d0f118 100644
--- a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll
+++ b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll
@@ -8,8 +8,8 @@
 define hidden <vscale x 2 x i64> @test_load_sve_lane0(ptr nocapture noundef readonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
 ; CHECK-LABEL: test_load_sve_lane0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldapr x8, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    ldapr x8, [x0]
 ; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    ret
   %1 = load atomic i64, ptr %a acquire, align 8
@@ -20,13 +20,13 @@ define hidden <vscale x 2 x i64> @test_load_sve_lane0(ptr nocapture noundef read
 define hidden <vscale x 2 x i64> @test_load_sve_lane1(ptr nocapture noundef readonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
 ; CHECK-LABEL: test_load_sve_lane1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    ldapr x9, [x0]
-; CHECK-NEXT:    index z2.d, #0, #1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    mov z0.d, p0/m, x9
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    ldapr x8, [x0]
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    ret
   %1 = load atomic i64, ptr %a acquire, align 8
   %vldap1_lane = insertelement <vscale x 2 x i64> %b, i64 %1, i64 1

diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
index 71b3f1d2479914..a20a76c00418d1 100644
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -95,14 +95,14 @@ define i1 @test_redand_v8i1(<8 x i1> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    and w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    and w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    and w12, w12, w13
+; GISEL-NEXT:    and w11, w12, w13
 ; GISEL-NEXT:    and w8, w8, w10
-; GISEL-NEXT:    and w9, w9, w11
-; GISEL-NEXT:    and w9, w12, w9
+; GISEL-NEXT:    and w9, w14, w9
+; GISEL-NEXT:    and w9, w11, w9
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    and w0, w8, #0x1
 ; GISEL-NEXT:    ret
@@ -130,39 +130,39 @@ define i1 @test_redand_v16i1(<16 x i1> %a) {
 ; GISEL-NEXT:    mov b6, v0.b[6]
 ; GISEL-NEXT:    mov b7, v0.b[7]
 ; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    fmov w10, s2
-; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    mov b16, v0.b[8]
 ; GISEL-NEXT:    mov b17, v0.b[9]
 ; GISEL-NEXT:    mov b18, v0.b[10]
 ; GISEL-NEXT:    mov b19, v0.b[11]
-; GISEL-NEXT:    and w8, w8, w9
-; GISEL-NEXT:    and w9, w10, w11
-; GISEL-NEXT:    fmov w10, s4
-; GISEL-NEXT:    fmov w11, s5
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    fmov w10, s2
+; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s6
-; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b20, v0.b[12]
 ; GISEL-NEXT:    mov b21, v0.b[13]
+; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b22, v0.b[14]
 ; GISEL-NEXT:    mov b23, v0.b[15]
-; GISEL-NEXT:    and w10, w10, w11
-; GISEL-NEXT:    and w11, w12, w13
-; GISEL-NEXT:    fmov w12, s16
-; GISEL-NEXT:    fmov w13, s17
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    fmov w10, s4
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    fmov w11, s5
 ; GISEL-NEXT:    fmov w14, s18
 ; GISEL-NEXT:    fmov w15, s19
 ; GISEL-NEXT:    fmov w16, s22
 ; GISEL-NEXT:    fmov w17, s23
+; GISEL-NEXT:    and w10, w10, w11
+; GISEL-NEXT:    and w11, w12, w13
+; GISEL-NEXT:    fmov w12, s16
+; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    fmov w13, s17
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    and w12, w12, w13
-; GISEL-NEXT:    and w9, w10, w11
 ; GISEL-NEXT:    and w13, w14, w15
 ; GISEL-NEXT:    fmov w14, s20
 ; GISEL-NEXT:    fmov w15, s21
 ; GISEL-NEXT:    and w10, w12, w13
-; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    and w14, w14, w15
 ; GISEL-NEXT:    and w15, w16, w17
 ; GISEL-NEXT:    and w11, w14, w15
@@ -192,39 +192,39 @@ define <16 x i1> @test_redand_ins_v16i1(<16 x i1> %a) {
 ; GISEL-NEXT:    mov b6, v0.b[6]
 ; GISEL-NEXT:    mov b7, v0.b[7]
 ; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    fmov w10, s2
-; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    mov b16, v0.b[8]
 ; GISEL-NEXT:    mov b17, v0.b[9]
 ; GISEL-NEXT:    mov b18, v0.b[10]
 ; GISEL-NEXT:    mov b19, v0.b[11]
-; GISEL-NEXT:    and w8, w8, w9
-; GISEL-NEXT:    and w9, w10, w11
-; GISEL-NEXT:    fmov w10, s4
-; GISEL-NEXT:    fmov w11, s5
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    fmov w10, s2
+; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s6
-; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b20, v0.b[12]
 ; GISEL-NEXT:    mov b21, v0.b[13]
+; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b22, v0.b[14]
 ; GISEL-NEXT:    mov b23, v0.b[15]
-; GISEL-NEXT:    and w10, w10, w11
-; GISEL-NEXT:    and w11, w12, w13
-; GISEL-NEXT:    fmov w12, s16
-; GISEL-NEXT:    fmov w13, s17
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    fmov w10, s4
+; GISEL-NEXT:    and w8, w8, w9
+; GISEL-NEXT:    fmov w11, s5
 ; GISEL-NEXT:    fmov w14, s18
 ; GISEL-NEXT:    fmov w15, s19
 ; GISEL-NEXT:    fmov w16, s22
 ; GISEL-NEXT:    fmov w17, s23
+; GISEL-NEXT:    and w10, w10, w11
+; GISEL-NEXT:    and w11, w12, w13
+; GISEL-NEXT:    fmov w12, s16
+; GISEL-NEXT:    and w9, w10, w11
+; GISEL-NEXT:    fmov w13, s17
 ; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    and w12, w12, w13
-; GISEL-NEXT:    and w9, w10, w11
 ; GISEL-NEXT:    and w13, w14, w15
 ; GISEL-NEXT:    fmov w14, s20
 ; GISEL-NEXT:    fmov w15, s21
 ; GISEL-NEXT:    and w10, w12, w13
-; GISEL-NEXT:    and w8, w8, w9
 ; GISEL-NEXT:    and w14, w14, w15
 ; GISEL-NEXT:    and w15, w16, w17
 ; GISEL-NEXT:    and w11, w14, w15
@@ -328,14 +328,14 @@ define i8 @test_redand_v8i8(<8 x i8> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    and w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    and w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    and w12, w12, w13
+; GISEL-NEXT:    and w11, w12, w13
 ; GISEL-NEXT:    and w8, w8, w10
-; GISEL-NEXT:    and w9, w9, w11
-; GISEL-NEXT:    and w9, w12, w9
+; GISEL-NEXT:    and w9, w14, w9
+; GISEL-NEXT:    and w9, w11, w9
 ; GISEL-NEXT:    and w0, w8, w9
 ; GISEL-NEXT:    ret
   %and_result = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
@@ -371,14 +371,14 @@ define i8 @test_redand_v16i8(<16 x i8> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    and w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    and w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    and w12, w12, w13
+; GISEL-NEXT:    and w11, w12, w13
 ; GISEL-NEXT:    and w8, w8, w10
-; GISEL-NEXT:    and w9, w9, w11
-; GISEL-NEXT:    and w9, w12, w9
+; GISEL-NEXT:    and w9, w14, w9
+; GISEL-NEXT:    and w9, w11, w9
 ; GISEL-NEXT:    and w0, w8, w9
 ; GISEL-NEXT:    ret
   %and_result = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
@@ -416,14 +416,14 @@ define i8 @test_redand_v32i8(<32 x i8> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    and w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    and w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    and w12, w12, w13
+; GISEL-NEXT:    and w11, w12, w13
 ; GISEL-NEXT:    and w8, w8, w10
-; GISEL-NEXT:    and w9, w9, w11
-; GISEL-NEXT:    and w9, w12, w9
+; GISEL-NEXT:    and w9, w14, w9
+; GISEL-NEXT:    and w9, w11, w9
 ; GISEL-NEXT:    and w0, w8, w9
 ; GISEL-NEXT:    ret
   %and_result = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a)

diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
index 591182018164c3..4c30a32934964b 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -95,14 +95,14 @@ define i1 @test_redor_v8i1(<8 x i1> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    orr w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    orr w12, w12, w13
+; GISEL-NEXT:    orr w11, w12, w13
 ; GISEL-NEXT:    orr w8, w8, w10
-; GISEL-NEXT:    orr w9, w9, w11
-; GISEL-NEXT:    orr w9, w12, w9
+; GISEL-NEXT:    orr w9, w14, w9
+; GISEL-NEXT:    orr w9, w11, w9
 ; GISEL-NEXT:    orr w8, w8, w9
 ; GISEL-NEXT:    and w0, w8, #0x1
 ; GISEL-NEXT:    ret
@@ -130,39 +130,39 @@ define i1 @test_redor_v16i1(<16 x i1> %a) {
 ; GISEL-NEXT:    mov b6, v0.b[6]
 ; GISEL-NEXT:    mov b7, v0.b[7]
 ; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    fmov w10, s2
-; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    mov b16, v0.b[8]
 ; GISEL-NEXT:    mov b17, v0.b[9]
 ; GISEL-NEXT:    mov b18, v0.b[10]
 ; GISEL-NEXT:    mov b19, v0.b[11]
-; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    orr w9, w10, w11
-; GISEL-NEXT:    fmov w10, s4
-; GISEL-NEXT:    fmov w11, s5
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    fmov w10, s2
+; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s6
-; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b20, v0.b[12]
 ; GISEL-NEXT:    mov b21, v0.b[13]
+; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b22, v0.b[14]
 ; GISEL-NEXT:    mov b23, v0.b[15]
-; GISEL-NEXT:    orr w10, w10, w11
-; GISEL-NEXT:    orr w11, w12, w13
-; GISEL-NEXT:    fmov w12, s16
-; GISEL-NEXT:    fmov w13, s17
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    orr w9, w10, w11
+; GISEL-NEXT:    fmov w10, s4
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    fmov w11, s5
 ; GISEL-NEXT:    fmov w14, s18
 ; GISEL-NEXT:    fmov w15, s19
 ; GISEL-NEXT:    fmov w16, s22
 ; GISEL-NEXT:    fmov w17, s23
+; GISEL-NEXT:    orr w10, w10, w11
+; GISEL-NEXT:    orr w11, w12, w13
+; GISEL-NEXT:    fmov w12, s16
+; GISEL-NEXT:    orr w9, w10, w11
+; GISEL-NEXT:    fmov w13, s17
 ; GISEL-NEXT:    orr w8, w8, w9
 ; GISEL-NEXT:    orr w12, w12, w13
-; GISEL-NEXT:    orr w9, w10, w11
 ; GISEL-NEXT:    orr w13, w14, w15
 ; GISEL-NEXT:    fmov w14, s20
 ; GISEL-NEXT:    fmov w15, s21
 ; GISEL-NEXT:    orr w10, w12, w13
-; GISEL-NEXT:    orr w8, w8, w9
 ; GISEL-NEXT:    orr w14, w14, w15
 ; GISEL-NEXT:    orr w15, w16, w17
 ; GISEL-NEXT:    orr w11, w14, w15
@@ -192,39 +192,39 @@ define <16 x i1> @test_redor_ins_v16i1(<16 x i1> %a) {
 ; GISEL-NEXT:    mov b6, v0.b[6]
 ; GISEL-NEXT:    mov b7, v0.b[7]
 ; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    fmov w10, s2
-; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    mov b16, v0.b[8]
 ; GISEL-NEXT:    mov b17, v0.b[9]
 ; GISEL-NEXT:    mov b18, v0.b[10]
 ; GISEL-NEXT:    mov b19, v0.b[11]
-; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    orr w9, w10, w11
-; GISEL-NEXT:    fmov w10, s4
-; GISEL-NEXT:    fmov w11, s5
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    fmov w10, s2
+; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s6
-; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b20, v0.b[12]
 ; GISEL-NEXT:    mov b21, v0.b[13]
+; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b22, v0.b[14]
 ; GISEL-NEXT:    mov b23, v0.b[15]
-; GISEL-NEXT:    orr w10, w10, w11
-; GISEL-NEXT:    orr w11, w12, w13
-; GISEL-NEXT:    fmov w12, s16
-; GISEL-NEXT:    fmov w13, s17
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    orr w9, w10, w11
+; GISEL-NEXT:    fmov w10, s4
+; GISEL-NEXT:    orr w8, w8, w9
+; GISEL-NEXT:    fmov w11, s5
 ; GISEL-NEXT:    fmov w14, s18
 ; GISEL-NEXT:    fmov w15, s19
 ; GISEL-NEXT:    fmov w16, s22
 ; GISEL-NEXT:    fmov w17, s23
+; GISEL-NEXT:    orr w10, w10, w11
+; GISEL-NEXT:    orr w11, w12, w13
+; GISEL-NEXT:    fmov w12, s16
+; GISEL-NEXT:    orr w9, w10, w11
+; GISEL-NEXT:    fmov w13, s17
 ; GISEL-NEXT:    orr w8, w8, w9
 ; GISEL-NEXT:    orr w12, w12, w13
-; GISEL-NEXT:    orr w9, w10, w11
 ; GISEL-NEXT:    orr w13, w14, w15
 ; GISEL-NEXT:    fmov w14, s20
 ; GISEL-NEXT:    fmov w15, s21
 ; GISEL-NEXT:    orr w10, w12, w13
-; GISEL-NEXT:    orr w8, w8, w9
 ; GISEL-NEXT:    orr w14, w14, w15
 ; GISEL-NEXT:    orr w15, w16, w17
 ; GISEL-NEXT:    orr w11, w14, w15
@@ -330,14 +330,14 @@ define i8 @test_redor_v8i8(<8 x i8> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    orr w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    orr w12, w12, w13
+; GISEL-NEXT:    orr w11, w12, w13
 ; GISEL-NEXT:    orr w8, w8, w10
-; GISEL-NEXT:    orr w9, w9, w11
-; GISEL-NEXT:    orr w9, w12, w9
+; GISEL-NEXT:    orr w9, w14, w9
+; GISEL-NEXT:    orr w9, w11, w9
 ; GISEL-NEXT:    orr w0, w8, w9
 ; GISEL-NEXT:    ret
   %or_result = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
@@ -373,14 +373,14 @@ define i8 @test_redor_v16i8(<16 x i8> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    orr w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    orr w12, w12, w13
+; GISEL-NEXT:    orr w11, w12, w13
 ; GISEL-NEXT:    orr w8, w8, w10
-; GISEL-NEXT:    orr w9, w9, w11
-; GISEL-NEXT:    orr w9, w12, w9
+; GISEL-NEXT:    orr w9, w14, w9
+; GISEL-NEXT:    orr w9, w11, w9
 ; GISEL-NEXT:    orr w0, w8, w9
 ; GISEL-NEXT:    ret
   %or_result = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
@@ -418,14 +418,14 @@ define i8 @test_redor_v32i8(<32 x i8> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    orr w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    orr w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    orr w12, w12, w13
+; GISEL-NEXT:    orr w11, w12, w13
 ; GISEL-NEXT:    orr w8, w8, w10
-; GISEL-NEXT:    orr w9, w9, w11
-; GISEL-NEXT:    orr w9, w12, w9
+; GISEL-NEXT:    orr w9, w14, w9
+; GISEL-NEXT:    orr w9, w11, w9
 ; GISEL-NEXT:    orr w0, w8, w9
 ; GISEL-NEXT:    ret
   %or_result = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a)

diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
index a9365dbb1928c8..a080a7403811fc 100644
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@@ -4,122 +4,129 @@
 define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
 ; CHECK-LABEL: v1:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    sxtw x8, w1
-; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT:    sxtw x9, w3
-; CHECK-NEXT:    add x10, x0, x8
-; CHECK-NEXT:    add x11, x2, x9
-; CHECK-NEXT:    add x12, x10, x8
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x2]
+; CHECK-NEXT:    add x10, x0, x8
+; CHECK-NEXT:    add x11, x2, x9
 ; CHECK-NEXT:    ldr d2, [x10]
-; CHECK-NEXT:    add x10, x11, x9
-; CHECK-NEXT:    ldr d6, [x12, x8]
-; CHECK-NEXT:    ldr d7, [x10, x9]
+; CHECK-NEXT:    add x10, x10, x8
 ; CHECK-NEXT:    ldr d3, [x11]
-; CHECK-NEXT:    ldr d4, [x12]
-; CHECK-NEXT:    ldr d5, [x10]
-; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    add x11, x11, x9
+; CHECK-NEXT:    ldr d4, [x10]
+; CHECK-NEXT:    ldr d6, [x10, x8]
+; CHECK-NEXT:    ldr d5, [x11]
+; CHECK-NEXT:    ldr d7, [x11, x9]
 ; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT:    usubl v1.8h, v4.8b, v5.8b
 ; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
-; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
-; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
+; CHECK-NEXT:    shll2 v4.4s, v2.8h, #16
+; CHECK-NEXT:    shll2 v5.4s, v0.8h, #16
 ; CHECK-NEXT:    shll2 v6.4s, v3.8h, #16
 ; CHECK-NEXT:    shll2 v7.4s, v1.8h, #16
-; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
-; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT:    saddw v2.4s, v4.4s, v2.4h
+; CHECK-NEXT:    saddw v0.4s, v5.4s, v0.4h
 ; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
 ; CHECK-NEXT:    saddw v1.4s, v7.4s, v1.4h
-; CHECK-NEXT:    zip1 v5.4s, v2.4s, v0.4s
-; CHECK-NEXT:    zip2 v4.4s, v2.4s, v0.4s
-; CHECK-NEXT:    uzp2 v7.4s, v3.4s, v1.4s
+; CHECK-NEXT:    mov v7.16b, v2.16b
+; CHECK-NEXT:    zip1 v4.4s, v2.4s, v0.4s
+; CHECK-NEXT:    zip2 v6.4s, v2.4s, v0.4s
+; CHECK-NEXT:    uzp2 v5.4s, v3.4s, v1.4s
 ; CHECK-NEXT:    mov v17.16b, v1.16b
-; CHECK-NEXT:    zip2 v18.4s, v3.4s, v1.4s
-; CHECK-NEXT:    ext v19.16b, v2.16b, v5.16b, #8
-; CHECK-NEXT:    uzp2 v7.4s, v7.4s, v3.4s
-; CHECK-NEXT:    mov v2.s[3], v0.s[2]
-; CHECK-NEXT:    zip2 v6.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ext v16.16b, v3.16b, v3.16b, #12
+; CHECK-NEXT:    zip2 v16.4s, v1.4s, v3.4s
+; CHECK-NEXT:    mov v7.s[3], v0.s[2]
+; CHECK-NEXT:    ext v18.16b, v3.16b, v3.16b, #12
+; CHECK-NEXT:    ext v2.16b, v2.16b, v4.16b, #8
 ; CHECK-NEXT:    mov v17.s[1], v3.s[0]
+; CHECK-NEXT:    uzp2 v0.4s, v5.4s, v3.4s
+; CHECK-NEXT:    zip2 v5.4s, v3.4s, v1.4s
 ; CHECK-NEXT:    mov v3.s[0], v1.s[1]
-; CHECK-NEXT:    mov v7.d[1], v4.d[1]
-; CHECK-NEXT:    mov v18.d[1], v2.d[1]
-; CHECK-NEXT:    mov v17.d[1], v19.d[1]
-; CHECK-NEXT:    mov v3.d[1], v5.d[1]
-; CHECK-NEXT:    ext v16.16b, v1.16b, v16.16b, #12
-; CHECK-NEXT:    add v1.4s, v7.4s, v18.4s
-; CHECK-NEXT:    mov v6.d[1], v2.d[1]
-; CHECK-NEXT:    add v0.4s, v3.4s, v17.4s
-; CHECK-NEXT:    mov v16.d[1], v4.d[1]
-; CHECK-NEXT:    sub v2.4s, v17.4s, v3.4s
+; CHECK-NEXT:    ext v1.16b, v1.16b, v18.16b, #12
+; CHECK-NEXT:    mov v16.d[1], v7.d[1]
+; CHECK-NEXT:    mov v17.d[1], v2.d[1]
+; CHECK-NEXT:    mov v0.d[1], v6.d[1]
+; CHECK-NEXT:    mov v5.d[1], v7.d[1]
+; CHECK-NEXT:    mov v3.d[1], v4.d[1]
+; CHECK-NEXT:    mov v1.d[1], v6.d[1]
+; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
+; CHECK-NEXT:    add v2.4s, v3.4s, v17.4s
+; CHECK-NEXT:    sub v3.4s, v17.4s, v3.4s
+; CHECK-NEXT:    sub v1.4s, v16.4s, v1.4s
+; CHECK-NEXT:    rev64 v4.4s, v0.4s
+; CHECK-NEXT:    rev64 v5.4s, v2.4s
+; CHECK-NEXT:    add v6.4s, v1.4s, v3.4s
+; CHECK-NEXT:    sub v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    mov v4.d[1], v0.d[1]
+; CHECK-NEXT:    mov v5.d[1], v2.d[1]
 ; CHECK-NEXT:    rev64 v3.4s, v1.4s
-; CHECK-NEXT:    rev64 v5.4s, v0.4s
-; CHECK-NEXT:    sub v4.4s, v6.4s, v16.4s
-; CHECK-NEXT:    mov v3.d[1], v1.d[1]
-; CHECK-NEXT:    mov v5.d[1], v0.d[1]
-; CHECK-NEXT:    add v6.4s, v4.4s, v2.4s
 ; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    rev64 v4.4s, v2.4s
-; CHECK-NEXT:    rev64 v3.4s, v6.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    addp v7.4s, v0.4s, v2.4s
-; CHECK-NEXT:    addp v5.4s, v1.4s, v6.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    sub v3.4s, v6.4s, v3.4s
-; CHECK-NEXT:    rev64 v6.4s, v0.4s
-; CHECK-NEXT:    ext v4.16b, v7.16b, v2.16b, #4
-; CHECK-NEXT:    rev64 v16.4s, v1.4s
-; CHECK-NEXT:    ext v17.16b, v5.16b, v3.16b, #4
-; CHECK-NEXT:    sub v0.4s, v0.4s, v6.4s
-; CHECK-NEXT:    zip2 v4.4s, v4.4s, v7.4s
-; CHECK-NEXT:    ext v6.16b, v0.16b, v7.16b, #8
-; CHECK-NEXT:    sub v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    zip2 v16.4s, v17.4s, v5.4s
-; CHECK-NEXT:    zip1 v18.4s, v5.4s, v5.4s
-; CHECK-NEXT:    ext v19.16b, v1.16b, v5.16b, #4
-; CHECK-NEXT:    ext v4.16b, v2.16b, v4.16b, #12
-; CHECK-NEXT:    mov v2.s[2], v7.s[3]
-; CHECK-NEXT:    ext v17.16b, v6.16b, v0.16b, #4
-; CHECK-NEXT:    ext v16.16b, v3.16b, v16.16b, #12
-; CHECK-NEXT:    mov v3.s[2], v5.s[3]
-; CHECK-NEXT:    trn2 v1.4s, v18.4s, v1.4s
-; CHECK-NEXT:    ext v18.16b, v19.16b, v19.16b, #4
-; CHECK-NEXT:    mov v0.s[2], v7.s[1]
-; CHECK-NEXT:    uzp2 v6.4s, v6.4s, v17.4s
-; CHECK-NEXT:    sub v17.4s, v2.4s, v4.4s
-; CHECK-NEXT:    sub v21.4s, v3.4s, v16.4s
-; CHECK-NEXT:    mov v3.s[1], v5.s[2]
-; CHECK-NEXT:    mov v2.s[1], v7.s[2]
-; CHECK-NEXT:    sub v19.4s, v1.4s, v18.4s
-; CHECK-NEXT:    mov v18.s[0], v5.s[1]
-; CHECK-NEXT:    sub v20.4s, v0.4s, v6.4s
-; CHECK-NEXT:    mov v0.s[1], v7.s[0]
-; CHECK-NEXT:    add v3.4s, v3.4s, v16.4s
-; CHECK-NEXT:    add v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v18.4s
-; CHECK-NEXT:    mov v2.d[1], v17.d[1]
-; CHECK-NEXT:    mov v3.d[1], v21.d[1]
+; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
+; CHECK-NEXT:    rev64 v4.4s, v6.4s
+; CHECK-NEXT:    rev64 v5.4s, v2.4s
+; CHECK-NEXT:    rev64 v7.4s, v0.4s
+; CHECK-NEXT:    addp v16.4s, v0.4s, v6.4s
+; CHECK-NEXT:    addp v17.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sub v4.4s, v6.4s, v4.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    sub v2.4s, v2.4s, v5.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    zip1 v21.4s, v16.4s, v16.4s
+; CHECK-NEXT:    ext v5.16b, v17.16b, v1.16b, #4
+; CHECK-NEXT:    ext v6.16b, v16.16b, v4.16b, #4
+; CHECK-NEXT:    mov v18.16b, v1.16b
+; CHECK-NEXT:    mov v19.16b, v4.16b
+; CHECK-NEXT:    ext v3.16b, v2.16b, v17.16b, #8
+; CHECK-NEXT:    ext v7.16b, v0.16b, v16.16b, #4
+; CHECK-NEXT:    mov v18.s[2], v17.s[3]
+; CHECK-NEXT:    zip2 v5.4s, v5.4s, v17.4s
+; CHECK-NEXT:    zip2 v6.4s, v6.4s, v16.4s
+; CHECK-NEXT:    mov v19.s[2], v16.s[3]
+; CHECK-NEXT:    trn2 v0.4s, v21.4s, v0.4s
+; CHECK-NEXT:    ext v20.16b, v3.16b, v2.16b, #4
+; CHECK-NEXT:    ext v7.16b, v7.16b, v7.16b, #4
+; CHECK-NEXT:    mov v2.s[2], v17.s[1]
+; CHECK-NEXT:    ext v1.16b, v1.16b, v5.16b, #12
+; CHECK-NEXT:    ext v4.16b, v4.16b, v6.16b, #12
+; CHECK-NEXT:    mov v5.16b, v18.16b
+; CHECK-NEXT:    uzp2 v3.4s, v3.4s, v20.4s
+; CHECK-NEXT:    mov v6.16b, v7.16b
+; CHECK-NEXT:    mov v20.16b, v19.16b
+; CHECK-NEXT:    mov v21.16b, v2.16b
+; CHECK-NEXT:    mov v5.s[1], v17.s[2]
+; CHECK-NEXT:    sub v7.4s, v0.4s, v7.4s
+; CHECK-NEXT:    mov v6.s[0], v16.s[1]
+; CHECK-NEXT:    mov v20.s[1], v16.s[2]
+; CHECK-NEXT:    sub v16.4s, v19.4s, v4.4s
+; CHECK-NEXT:    mov v21.s[1], v17.s[0]
+; CHECK-NEXT:    sub v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    sub v17.4s, v18.4s, v1.4s
+; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v6.4s
-; CHECK-NEXT:    mov v1.d[1], v19.d[1]
-; CHECK-NEXT:    mov v0.d[1], v20.d[1]
-; CHECK-NEXT:    cmlt v6.8h, v2.8h, #0
-; CHECK-NEXT:    cmlt v7.8h, v3.8h, #0
-; CHECK-NEXT:    cmlt v4.8h, v1.8h, #0
-; CHECK-NEXT:    add v2.4s, v6.4s, v2.4s
-; CHECK-NEXT:    add v3.4s, v7.4s, v3.4s
-; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
-; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT:    eor v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    eor v0.16b, v0.16b, v5.16b
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    add v4.4s, v20.4s, v4.4s
+; CHECK-NEXT:    add v3.4s, v21.4s, v3.4s
+; CHECK-NEXT:    mov v1.d[1], v17.d[1]
+; CHECK-NEXT:    mov v0.d[1], v7.d[1]
+; CHECK-NEXT:    mov v4.d[1], v16.d[1]
+; CHECK-NEXT:    mov v3.d[1], v2.d[1]
+; CHECK-NEXT:    cmlt v7.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v2.8h, v0.8h, #0
+; CHECK-NEXT:    cmlt v6.8h, v4.8h, #0
+; CHECK-NEXT:    cmlt v5.8h, v3.8h, #0
+; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    add v4.4s, v6.4s, v4.4s
+; CHECK-NEXT:    add v3.4s, v5.4s, v3.4s
+; CHECK-NEXT:    eor v1.16b, v1.16b, v7.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    eor v2.16b, v3.16b, v5.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v6.16b
+; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    lsr w9, w8, #16
@@ -224,110 +231,112 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    sxtw x8, w1
 ; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT:    sxtw x9, w3
+; CHECK-NEXT:    ldr d4, [x0]
+; CHECK-NEXT:    ldr d5, [x2]
 ; CHECK-NEXT:    add x10, x0, x8
 ; CHECK-NEXT:    add x11, x2, x9
 ; CHECK-NEXT:    add x12, x10, x8
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr d1, [x2]
-; CHECK-NEXT:    ldr d2, [x10]
-; CHECK-NEXT:    add x10, x11, x9
-; CHECK-NEXT:    ldr d6, [x12, x8]
-; CHECK-NEXT:    ldr d7, [x10, x9]
-; CHECK-NEXT:    ldr d3, [x11]
-; CHECK-NEXT:    ldr d4, [x12]
-; CHECK-NEXT:    ldr d5, [x10]
-; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
-; CHECK-NEXT:    usubl v1.8h, v4.8b, v5.8b
+; CHECK-NEXT:    ldr d6, [x10]
+; CHECK-NEXT:    ldr d7, [x11]
+; CHECK-NEXT:    ldr d0, [x12, x8]
+; CHECK-NEXT:    add x8, x11, x9
+; CHECK-NEXT:    ldr d1, [x12]
+; CHECK-NEXT:    ldr d2, [x8, x9]
+; CHECK-NEXT:    ldr d3, [x8]
+; CHECK-NEXT:    usubl v1.8h, v1.8b, v3.8b
+; CHECK-NEXT:    usubl v0.8h, v0.8b, v2.8b
 ; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
+; CHECK-NEXT:    usubl v2.8h, v4.8b, v5.8b
 ; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
-; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
-; CHECK-NEXT:    shll2 v6.4s, v3.8h, #16
-; CHECK-NEXT:    shll2 v7.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v5.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v7.4s, v3.8h, #16
+; CHECK-NEXT:    shll2 v6.4s, v2.8h, #16
 ; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
-; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
-; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
-; CHECK-NEXT:    saddw v1.4s, v7.4s, v1.4h
-; CHECK-NEXT:    zip1 v5.4s, v2.4s, v0.4s
-; CHECK-NEXT:    ext v17.16b, v3.16b, v3.16b, #12
-; CHECK-NEXT:    uzp2 v7.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mov v16.16b, v3.16b
-; CHECK-NEXT:    zip2 v4.4s, v2.4s, v0.4s
-; CHECK-NEXT:    zip2 v6.4s, v1.4s, v3.4s
-; CHECK-NEXT:    zip2 v18.4s, v3.4s, v1.4s
-; CHECK-NEXT:    ext v19.16b, v2.16b, v5.16b, #8
-; CHECK-NEXT:    mov v16.s[0], v1.s[1]
-; CHECK-NEXT:    ext v17.16b, v1.16b, v17.16b, #12
-; CHECK-NEXT:    uzp2 v7.4s, v7.4s, v3.4s
-; CHECK-NEXT:    mov v2.s[3], v0.s[2]
-; CHECK-NEXT:    mov v1.s[1], v3.s[0]
-; CHECK-NEXT:    mov v16.d[1], v5.d[1]
-; CHECK-NEXT:    mov v7.d[1], v4.d[1]
+; CHECK-NEXT:    saddw v1.4s, v5.4s, v1.4h
+; CHECK-NEXT:    saddw v3.4s, v7.4s, v3.4h
+; CHECK-NEXT:    saddw v2.4s, v6.4s, v2.4h
+; CHECK-NEXT:    uzp2 v4.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v7.16b, v3.16b
+; CHECK-NEXT:    mov v17.16b, v1.16b
+; CHECK-NEXT:    zip1 v5.4s, v3.4s, v2.4s
+; CHECK-NEXT:    zip2 v6.4s, v3.4s, v2.4s
+; CHECK-NEXT:    zip2 v16.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ext v18.16b, v0.16b, v0.16b, #12
+; CHECK-NEXT:    mov v7.s[3], v2.s[2]
+; CHECK-NEXT:    mov v17.s[1], v0.s[0]
+; CHECK-NEXT:    uzp2 v2.4s, v4.4s, v0.4s
+; CHECK-NEXT:    mov v4.16b, v0.16b
+; CHECK-NEXT:    zip2 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ext v3.16b, v3.16b, v5.16b, #8
+; CHECK-NEXT:    mov v4.s[0], v1.s[1]
+; CHECK-NEXT:    mov v16.d[1], v7.d[1]
+; CHECK-NEXT:    ext v1.16b, v1.16b, v18.16b, #12
+; CHECK-NEXT:    mov v2.d[1], v6.d[1]
+; CHECK-NEXT:    mov v0.d[1], v7.d[1]
+; CHECK-NEXT:    mov v17.d[1], v3.d[1]
+; CHECK-NEXT:    mov v4.d[1], v5.d[1]
+; CHECK-NEXT:    mov v1.d[1], v6.d[1]
+; CHECK-NEXT:    add v2.4s, v2.4s, v16.4s
+; CHECK-NEXT:    add v3.4s, v4.4s, v17.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    sub v1.4s, v17.4s, v4.4s
+; CHECK-NEXT:    rev64 v5.4s, v2.4s
+; CHECK-NEXT:    rev64 v6.4s, v3.4s
+; CHECK-NEXT:    sub v4.4s, v1.4s, v0.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v5.d[1], v2.d[1]
+; CHECK-NEXT:    mov v6.d[1], v3.d[1]
+; CHECK-NEXT:    sub v3.4s, v3.4s, v5.4s
+; CHECK-NEXT:    add v1.4s, v2.4s, v6.4s
+; CHECK-NEXT:    zip1 v2.4s, v3.4s, v4.4s
+; CHECK-NEXT:    zip2 v7.4s, v3.4s, v4.4s
+; CHECK-NEXT:    zip1 v5.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp2 v6.4s, v1.4s, v0.4s
+; CHECK-NEXT:    mov v18.16b, v1.16b
+; CHECK-NEXT:    ext v16.16b, v3.16b, v2.16b, #8
+; CHECK-NEXT:    zip2 v17.4s, v1.4s, v0.4s
+; CHECK-NEXT:    mov v3.s[3], v4.s[2]
+; CHECK-NEXT:    mov v18.s[1], v0.s[1]
+; CHECK-NEXT:    trn2 v4.4s, v1.4s, v5.4s
+; CHECK-NEXT:    uzp2 v1.4s, v6.4s, v1.4s
+; CHECK-NEXT:    mov v17.d[1], v3.d[1]
 ; CHECK-NEXT:    mov v18.d[1], v2.d[1]
-; CHECK-NEXT:    mov v1.d[1], v19.d[1]
-; CHECK-NEXT:    mov v6.d[1], v2.d[1]
-; CHECK-NEXT:    mov v17.d[1], v4.d[1]
-; CHECK-NEXT:    add v0.4s, v7.4s, v18.4s
-; CHECK-NEXT:    add v2.4s, v16.4s, v1.4s
-; CHECK-NEXT:    rev64 v3.4s, v0.4s
-; CHECK-NEXT:    rev64 v4.4s, v2.4s
-; CHECK-NEXT:    sub v5.4s, v6.4s, v17.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mov v3.d[1], v0.d[1]
-; CHECK-NEXT:    mov v4.d[1], v2.d[1]
-; CHECK-NEXT:    add v6.4s, v5.4s, v1.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    zip1 v3.4s, v2.4s, v1.4s
-; CHECK-NEXT:    uzp2 v5.4s, v0.4s, v6.4s
-; CHECK-NEXT:    zip2 v4.4s, v2.4s, v1.4s
-; CHECK-NEXT:    zip1 v7.4s, v0.4s, v6.4s
-; CHECK-NEXT:    ext v16.16b, v2.16b, v3.16b, #8
-; CHECK-NEXT:    zip2 v17.4s, v0.4s, v6.4s
-; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v0.4s
-; CHECK-NEXT:    mov v2.s[3], v1.s[2]
-; CHECK-NEXT:    mov v18.16b, v0.16b
-; CHECK-NEXT:    trn2 v0.4s, v0.4s, v7.4s
-; CHECK-NEXT:    mov v18.s[1], v6.s[1]
-; CHECK-NEXT:    mov v5.d[1], v4.d[1]
-; CHECK-NEXT:    mov v17.d[1], v2.d[1]
-; CHECK-NEXT:    mov v0.d[1], v16.d[1]
-; CHECK-NEXT:    mov v18.d[1], v3.d[1]
-; CHECK-NEXT:    add v1.4s, v17.4s, v5.4s
-; CHECK-NEXT:    sub v2.4s, v5.4s, v17.4s
-; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    add v3.4s, v18.4s, v0.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v18.4s
-; CHECK-NEXT:    ext v5.16b, v3.16b, v3.16b, #4
-; CHECK-NEXT:    ext v16.16b, v4.16b, v2.16b, #8
-; CHECK-NEXT:    zip1 v6.4s, v1.4s, v2.4s
-; CHECK-NEXT:    zip2 v7.4s, v1.4s, v2.4s
-; CHECK-NEXT:    ext v17.16b, v5.16b, v0.16b, #8
-; CHECK-NEXT:    zip2 v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    zip2 v2.4s, v0.4s, v3.4s
-; CHECK-NEXT:    ext v4.16b, v16.16b, v4.16b, #4
-; CHECK-NEXT:    zip1 v16.4s, v3.4s, v0.4s
-; CHECK-NEXT:    zip2 v0.4s, v3.4s, v0.4s
-; CHECK-NEXT:    ext v5.16b, v17.16b, v5.16b, #4
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    sub v3.4s, v6.4s, v16.4s
-; CHECK-NEXT:    sub v0.4s, v7.4s, v0.4s
-; CHECK-NEXT:    cmlt v6.8h, v1.8h, #0
-; CHECK-NEXT:    cmlt v7.8h, v0.8h, #0
-; CHECK-NEXT:    add v2.4s, v5.4s, v4.4s
-; CHECK-NEXT:    cmlt v4.8h, v3.8h, #0
-; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v7.4s, v0.4s
-; CHECK-NEXT:    cmlt v5.8h, v2.8h, #0
-; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT:    eor v0.16b, v0.16b, v7.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v6.16b
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    add v2.4s, v5.4s, v2.4s
-; CHECK-NEXT:    eor v1.16b, v3.16b, v4.16b
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
+; CHECK-NEXT:    mov v4.d[1], v16.d[1]
+; CHECK-NEXT:    mov v1.d[1], v7.d[1]
+; CHECK-NEXT:    add v0.4s, v17.4s, v1.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v17.4s
+; CHECK-NEXT:    add v2.4s, v18.4s, v4.4s
+; CHECK-NEXT:    sub v3.4s, v4.4s, v18.4s
+; CHECK-NEXT:    zip2 v4.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ext v5.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT:    ext v6.16b, v2.16b, v2.16b, #4
+; CHECK-NEXT:    zip2 v7.4s, v1.4s, v0.4s
+; CHECK-NEXT:    zip2 v16.4s, v3.4s, v2.4s
+; CHECK-NEXT:    zip2 v17.4s, v2.4s, v3.4s
+; CHECK-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    zip1 v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    ext v1.16b, v5.16b, v1.16b, #8
+; CHECK-NEXT:    ext v18.16b, v6.16b, v3.16b, #8
+; CHECK-NEXT:    add v3.4s, v16.4s, v7.4s
+; CHECK-NEXT:    sub v4.4s, v4.4s, v17.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ext v1.16b, v1.16b, v5.16b, #4
+; CHECK-NEXT:    ext v5.16b, v18.16b, v6.16b, #4
+; CHECK-NEXT:    cmlt v2.8h, v4.8h, #0
+; CHECK-NEXT:    cmlt v6.8h, v3.8h, #0
+; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
+; CHECK-NEXT:    add v4.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
+; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    cmlt v4.8h, v1.8h, #0
+; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
+; CHECK-NEXT:    eor v0.16b, v0.16b, v5.16b
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    eor v1.16b, v1.16b, v4.16b
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
@@ -434,111 +443,110 @@ entry:
 define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocapture noundef readonly %p2, i32 noundef %i2) {
 ; CHECK-LABEL: v3:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT:    sxtw x8, w1
-; CHECK-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-NEXT:    sxtw x9, w3
-; CHECK-NEXT:    add x10, x0, x8
-; CHECK-NEXT:    add x11, x2, x9
-; CHECK-NEXT:    add x12, x10, x8
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x2]
+; CHECK-NEXT:    add x10, x0, x8
+; CHECK-NEXT:    add x11, x2, x9
 ; CHECK-NEXT:    ldr d2, [x10]
-; CHECK-NEXT:    add x10, x11, x9
-; CHECK-NEXT:    ldr d4, [x12, x8]
-; CHECK-NEXT:    ldr d5, [x10, x9]
 ; CHECK-NEXT:    ldr d3, [x11]
-; CHECK-NEXT:    ldr d6, [x12]
-; CHECK-NEXT:    ldr d7, [x10]
 ; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    usubl v1.8h, v4.8b, v5.8b
+; CHECK-NEXT:    add x10, x10, x8
+; CHECK-NEXT:    add x11, x11, x9
+; CHECK-NEXT:    usubl v1.8h, v2.8b, v3.8b
+; CHECK-NEXT:    ldr d2, [x10, x8]
+; CHECK-NEXT:    ldr d3, [x11, x9]
+; CHECK-NEXT:    ldr d4, [x10]
+; CHECK-NEXT:    ldr d5, [x11]
 ; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
-; CHECK-NEXT:    usubl v3.8h, v6.8b, v7.8b
+; CHECK-NEXT:    usubl v3.8h, v4.8b, v5.8b
 ; CHECK-NEXT:    shll2 v4.4s, v0.8h, #16
-; CHECK-NEXT:    shll2 v5.4s, v2.8h, #16
-; CHECK-NEXT:    shll2 v6.4s, v3.8h, #16
-; CHECK-NEXT:    shll2 v7.4s, v1.8h, #16
+; CHECK-NEXT:    shll2 v5.4s, v1.8h, #16
 ; CHECK-NEXT:    saddw v0.4s, v4.4s, v0.4h
-; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
-; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
-; CHECK-NEXT:    saddw v1.4s, v7.4s, v1.4h
+; CHECK-NEXT:    shll2 v4.4s, v2.8h, #16
+; CHECK-NEXT:    saddw v1.4s, v5.4s, v1.4h
+; CHECK-NEXT:    shll2 v5.4s, v3.8h, #16
+; CHECK-NEXT:    saddw v2.4s, v4.4s, v2.4h
+; CHECK-NEXT:    saddw v3.4s, v5.4s, v3.4h
 ; CHECK-NEXT:    rev64 v4.4s, v0.4s
-; CHECK-NEXT:    rev64 v5.4s, v2.4s
-; CHECK-NEXT:    rev64 v7.4s, v1.4s
-; CHECK-NEXT:    rev64 v16.4s, v3.4s
-; CHECK-NEXT:    addp v6.4s, v2.4s, v0.4s
-; CHECK-NEXT:    addp v17.4s, v1.4s, v3.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v5.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v16.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v7.4s
-; CHECK-NEXT:    ext v4.16b, v2.16b, v0.16b, #4
-; CHECK-NEXT:    zip2 v5.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mov v0.s[3], v2.s[2]
-; CHECK-NEXT:    uzp2 v7.4s, v17.4s, v6.4s
-; CHECK-NEXT:    zip1 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    ext v3.16b, v6.16b, v6.16b, #8
-; CHECK-NEXT:    mov v5.d[1], v0.d[1]
-; CHECK-NEXT:    ext v0.16b, v4.16b, v2.16b, #4
-; CHECK-NEXT:    uzp1 v2.4s, v17.4s, v6.4s
-; CHECK-NEXT:    rev64 v4.4s, v7.4s
-; CHECK-NEXT:    mov v1.d[1], v0.d[1]
-; CHECK-NEXT:    rev64 v0.4s, v2.4s
-; CHECK-NEXT:    uzp1 v2.4s, v17.4s, v3.4s
-; CHECK-NEXT:    uzp2 v3.4s, v17.4s, v3.4s
-; CHECK-NEXT:    add v6.4s, v5.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v4.4s, v0.4s
-; CHECK-NEXT:    sub v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v3.4s
-; CHECK-NEXT:    zip1 v3.4s, v0.4s, v6.4s
-; CHECK-NEXT:    zip1 v4.4s, v2.4s, v1.4s
-; CHECK-NEXT:    mov v7.16b, v0.16b
-; CHECK-NEXT:    uzp2 v5.4s, v0.4s, v6.4s
-; CHECK-NEXT:    trn2 v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    ext v16.16b, v2.16b, v4.16b, #8
-; CHECK-NEXT:    mov v7.s[1], v6.s[1]
-; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v0.4s
-; CHECK-NEXT:    zip2 v0.4s, v0.4s, v6.4s
-; CHECK-NEXT:    mov v3.d[1], v16.d[1]
-; CHECK-NEXT:    zip2 v6.4s, v2.4s, v1.4s
-; CHECK-NEXT:    mov v7.d[1], v4.d[1]
-; CHECK-NEXT:    mov v2.s[3], v1.s[2]
-; CHECK-NEXT:    mov v5.d[1], v6.d[1]
-; CHECK-NEXT:    add v1.4s, v3.4s, v7.4s
-; CHECK-NEXT:    mov v0.d[1], v2.d[1]
-; CHECK-NEXT:    ext v2.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    sub v3.4s, v7.4s, v3.4s
-; CHECK-NEXT:    add v4.4s, v5.4s, v0.4s
-; CHECK-NEXT:    ext v6.16b, v2.16b, v3.16b, #8
-; CHECK-NEXT:    ext v7.16b, v4.16b, v4.16b, #4
+; CHECK-NEXT:    rev64 v5.4s, v1.4s
+; CHECK-NEXT:    rev64 v6.4s, v2.4s
+; CHECK-NEXT:    rev64 v7.4s, v3.4s
+; CHECK-NEXT:    sub v4.4s, v0.4s, v4.4s
+; CHECK-NEXT:    addp v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    sub v5.4s, v1.4s, v5.4s
+; CHECK-NEXT:    sub v6.4s, v2.4s, v6.4s
+; CHECK-NEXT:    addp v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    sub v1.4s, v3.4s, v7.4s
+; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v7.16b, v5.16b, v4.16b, #4
+; CHECK-NEXT:    mov v4.s[3], v5.s[2]
+; CHECK-NEXT:    zip2 v16.4s, v6.4s, v1.4s
+; CHECK-NEXT:    zip1 v1.4s, v6.4s, v1.4s
+; CHECK-NEXT:    uzp2 v6.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ext v5.16b, v7.16b, v5.16b, #4
+; CHECK-NEXT:    uzp1 v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    uzp1 v7.4s, v2.4s, v3.4s
+; CHECK-NEXT:    uzp2 v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    mov v16.d[1], v4.d[1]
+; CHECK-NEXT:    rev64 v3.4s, v6.4s
+; CHECK-NEXT:    mov v1.d[1], v5.d[1]
+; CHECK-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-NEXT:    sub v2.4s, v7.4s, v2.4s
+; CHECK-NEXT:    sub v4.4s, v1.4s, v16.4s
+; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    add v1.4s, v16.4s, v1.4s
+; CHECK-NEXT:    zip1 v3.4s, v2.4s, v4.4s
+; CHECK-NEXT:    zip1 v5.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp2 v6.4s, v0.4s, v1.4s
+; CHECK-NEXT:    zip2 v7.4s, v0.4s, v1.4s
+; CHECK-NEXT:    zip2 v17.4s, v2.4s, v4.4s
+; CHECK-NEXT:    ext v16.16b, v2.16b, v3.16b, #8
+; CHECK-NEXT:    trn2 v5.4s, v0.4s, v5.4s
+; CHECK-NEXT:    uzp2 v6.4s, v6.4s, v0.4s
+; CHECK-NEXT:    mov v2.s[3], v4.s[2]
+; CHECK-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-NEXT:    mov v5.d[1], v16.d[1]
+; CHECK-NEXT:    mov v6.d[1], v17.d[1]
+; CHECK-NEXT:    mov v7.d[1], v2.d[1]
+; CHECK-NEXT:    mov v0.d[1], v3.d[1]
+; CHECK-NEXT:    add v1.4s, v6.4s, v7.4s
+; CHECK-NEXT:    sub v2.4s, v7.4s, v6.4s
+; CHECK-NEXT:    add v3.4s, v5.4s, v0.4s
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v5.4s
-; CHECK-NEXT:    zip2 v5.4s, v3.4s, v1.4s
-; CHECK-NEXT:    ext v2.16b, v6.16b, v2.16b, #4
-; CHECK-NEXT:    ext v6.16b, v7.16b, v0.16b, #8
-; CHECK-NEXT:    zip1 v16.4s, v4.4s, v0.4s
-; CHECK-NEXT:    zip2 v17.4s, v4.4s, v0.4s
-; CHECK-NEXT:    zip2 v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    ext v4.16b, v6.16b, v7.16b, #4
-; CHECK-NEXT:    zip1 v6.4s, v1.4s, v3.4s
-; CHECK-NEXT:    zip2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip2 v4.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #4
+; CHECK-NEXT:    ext v6.16b, v3.16b, v3.16b, #4
+; CHECK-NEXT:    zip2 v7.4s, v2.4s, v1.4s
+; CHECK-NEXT:    zip2 v16.4s, v0.4s, v3.4s
+; CHECK-NEXT:    zip2 v17.4s, v3.4s, v0.4s
+; CHECK-NEXT:    zip1 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ext v2.16b, v5.16b, v2.16b, #8
+; CHECK-NEXT:    ext v18.16b, v6.16b, v0.16b, #8
+; CHECK-NEXT:    zip1 v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    add v3.4s, v16.4s, v7.4s
+; CHECK-NEXT:    sub v4.4s, v4.4s, v17.4s
+; CHECK-NEXT:    ext v2.16b, v2.16b, v5.16b, #4
+; CHECK-NEXT:    ext v5.16b, v18.16b, v6.16b, #4
+; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    cmlt v1.8h, v4.8h, #0
+; CHECK-NEXT:    cmlt v6.8h, v3.8h, #0
+; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
+; CHECK-NEXT:    add v4.4s, v1.4s, v4.4s
+; CHECK-NEXT:    add v2.4s, v5.4s, v2.4s
+; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
 ; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT:    add v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    sub v3.4s, v16.4s, v6.4s
-; CHECK-NEXT:    sub v1.4s, v17.4s, v1.4s
-; CHECK-NEXT:    cmlt v6.8h, v0.8h, #0
-; CHECK-NEXT:    cmlt v7.8h, v1.8h, #0
-; CHECK-NEXT:    cmlt v4.8h, v3.8h, #0
-; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
-; CHECK-NEXT:    cmlt v5.8h, v2.8h, #0
-; CHECK-NEXT:    add v3.4s, v4.4s, v3.4s
-; CHECK-NEXT:    eor v1.16b, v1.16b, v7.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v6.16b
+; CHECK-NEXT:    eor v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
+; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
+; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    add v2.4s, v4.4s, v2.4s
+; CHECK-NEXT:    eor v0.16b, v0.16b, v5.16b
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    add v2.4s, v5.4s, v2.4s
-; CHECK-NEXT:    eor v1.16b, v3.16b, v4.16b
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
+; CHECK-NEXT:    eor v1.16b, v2.16b, v4.16b
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0

diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll
index 494399f8085793..c74b3734a1b76c 100644
--- a/llvm/test/CodeGen/AArch64/reduce-xor.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll
@@ -88,14 +88,14 @@ define i1 @test_redxor_v8i1(<8 x i1> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    eor w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    eor w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    eor w12, w12, w13
+; GISEL-NEXT:    eor w11, w12, w13
 ; GISEL-NEXT:    eor w8, w8, w10
-; GISEL-NEXT:    eor w9, w9, w11
-; GISEL-NEXT:    eor w9, w12, w9
+; GISEL-NEXT:    eor w9, w14, w9
+; GISEL-NEXT:    eor w9, w11, w9
 ; GISEL-NEXT:    eor w8, w8, w9
 ; GISEL-NEXT:    and w0, w8, #0x1
 ; GISEL-NEXT:    ret
@@ -121,39 +121,39 @@ define i1 @test_redxor_v16i1(<16 x i1> %a) {
 ; GISEL-NEXT:    mov b6, v0.b[6]
 ; GISEL-NEXT:    mov b7, v0.b[7]
 ; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    fmov w10, s2
-; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    mov b16, v0.b[8]
 ; GISEL-NEXT:    mov b17, v0.b[9]
 ; GISEL-NEXT:    mov b18, v0.b[10]
 ; GISEL-NEXT:    mov b19, v0.b[11]
-; GISEL-NEXT:    eor w8, w8, w9
-; GISEL-NEXT:    eor w9, w10, w11
-; GISEL-NEXT:    fmov w10, s4
-; GISEL-NEXT:    fmov w11, s5
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    fmov w10, s2
+; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s6
-; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b20, v0.b[12]
 ; GISEL-NEXT:    mov b21, v0.b[13]
+; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b22, v0.b[14]
 ; GISEL-NEXT:    mov b23, v0.b[15]
-; GISEL-NEXT:    eor w10, w10, w11
-; GISEL-NEXT:    eor w11, w12, w13
-; GISEL-NEXT:    fmov w12, s16
-; GISEL-NEXT:    fmov w13, s17
+; GISEL-NEXT:    eor w8, w8, w9
+; GISEL-NEXT:    eor w9, w10, w11
+; GISEL-NEXT:    fmov w10, s4
+; GISEL-NEXT:    eor w8, w8, w9
+; GISEL-NEXT:    fmov w11, s5
 ; GISEL-NEXT:    fmov w14, s18
 ; GISEL-NEXT:    fmov w15, s19
 ; GISEL-NEXT:    fmov w16, s22
 ; GISEL-NEXT:    fmov w17, s23
+; GISEL-NEXT:    eor w10, w10, w11
+; GISEL-NEXT:    eor w11, w12, w13
+; GISEL-NEXT:    fmov w12, s16
+; GISEL-NEXT:    eor w9, w10, w11
+; GISEL-NEXT:    fmov w13, s17
 ; GISEL-NEXT:    eor w8, w8, w9
 ; GISEL-NEXT:    eor w12, w12, w13
-; GISEL-NEXT:    eor w9, w10, w11
 ; GISEL-NEXT:    eor w13, w14, w15
 ; GISEL-NEXT:    fmov w14, s20
 ; GISEL-NEXT:    fmov w15, s21
 ; GISEL-NEXT:    eor w10, w12, w13
-; GISEL-NEXT:    eor w8, w8, w9
 ; GISEL-NEXT:    eor w14, w14, w15
 ; GISEL-NEXT:    eor w15, w16, w17
 ; GISEL-NEXT:    eor w11, w14, w15
@@ -181,39 +181,39 @@ define <16 x i1> @test_redxor_ins_v16i1(<16 x i1> %a) {
 ; GISEL-NEXT:    mov b6, v0.b[6]
 ; GISEL-NEXT:    mov b7, v0.b[7]
 ; GISEL-NEXT:    fmov w8, s0
-; GISEL-NEXT:    fmov w9, s1
-; GISEL-NEXT:    fmov w10, s2
-; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    mov b16, v0.b[8]
 ; GISEL-NEXT:    mov b17, v0.b[9]
 ; GISEL-NEXT:    mov b18, v0.b[10]
 ; GISEL-NEXT:    mov b19, v0.b[11]
-; GISEL-NEXT:    eor w8, w8, w9
-; GISEL-NEXT:    eor w9, w10, w11
-; GISEL-NEXT:    fmov w10, s4
-; GISEL-NEXT:    fmov w11, s5
+; GISEL-NEXT:    fmov w9, s1
+; GISEL-NEXT:    fmov w10, s2
+; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s6
-; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b20, v0.b[12]
 ; GISEL-NEXT:    mov b21, v0.b[13]
+; GISEL-NEXT:    fmov w13, s7
 ; GISEL-NEXT:    mov b22, v0.b[14]
 ; GISEL-NEXT:    mov b23, v0.b[15]
-; GISEL-NEXT:    eor w10, w10, w11
-; GISEL-NEXT:    eor w11, w12, w13
-; GISEL-NEXT:    fmov w12, s16
-; GISEL-NEXT:    fmov w13, s17
+; GISEL-NEXT:    eor w8, w8, w9
+; GISEL-NEXT:    eor w9, w10, w11
+; GISEL-NEXT:    fmov w10, s4
+; GISEL-NEXT:    eor w8, w8, w9
+; GISEL-NEXT:    fmov w11, s5
 ; GISEL-NEXT:    fmov w14, s18
 ; GISEL-NEXT:    fmov w15, s19
 ; GISEL-NEXT:    fmov w16, s22
 ; GISEL-NEXT:    fmov w17, s23
+; GISEL-NEXT:    eor w10, w10, w11
+; GISEL-NEXT:    eor w11, w12, w13
+; GISEL-NEXT:    fmov w12, s16
+; GISEL-NEXT:    eor w9, w10, w11
+; GISEL-NEXT:    fmov w13, s17
 ; GISEL-NEXT:    eor w8, w8, w9
 ; GISEL-NEXT:    eor w12, w12, w13
-; GISEL-NEXT:    eor w9, w10, w11
 ; GISEL-NEXT:    eor w13, w14, w15
 ; GISEL-NEXT:    fmov w14, s20
 ; GISEL-NEXT:    fmov w15, s21
 ; GISEL-NEXT:    eor w10, w12, w13
-; GISEL-NEXT:    eor w8, w8, w9
 ; GISEL-NEXT:    eor w14, w14, w15
 ; GISEL-NEXT:    eor w15, w16, w17
 ; GISEL-NEXT:    eor w11, w14, w15
@@ -319,14 +319,14 @@ define i8 @test_redxor_v8i8(<8 x i8> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    eor w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    eor w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    eor w12, w12, w13
+; GISEL-NEXT:    eor w11, w12, w13
 ; GISEL-NEXT:    eor w8, w8, w10
-; GISEL-NEXT:    eor w9, w9, w11
-; GISEL-NEXT:    eor w9, w12, w9
+; GISEL-NEXT:    eor w9, w14, w9
+; GISEL-NEXT:    eor w9, w11, w9
 ; GISEL-NEXT:    eor w0, w8, w9
 ; GISEL-NEXT:    ret
   %xor_result = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
@@ -362,14 +362,14 @@ define i8 @test_redxor_v16i8(<16 x i8> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    eor w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    eor w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    eor w12, w12, w13
+; GISEL-NEXT:    eor w11, w12, w13
 ; GISEL-NEXT:    eor w8, w8, w10
-; GISEL-NEXT:    eor w9, w9, w11
-; GISEL-NEXT:    eor w9, w12, w9
+; GISEL-NEXT:    eor w9, w14, w9
+; GISEL-NEXT:    eor w9, w11, w9
 ; GISEL-NEXT:    eor w0, w8, w9
 ; GISEL-NEXT:    ret
   %xor_result = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
@@ -407,14 +407,14 @@ define i8 @test_redxor_v32i8(<32 x i8> %a) {
 ; GISEL-NEXT:    fmov w11, s3
 ; GISEL-NEXT:    fmov w12, s4
 ; GISEL-NEXT:    fmov w13, s5
+; GISEL-NEXT:    fmov w14, s6
 ; GISEL-NEXT:    eor w8, w8, w9
-; GISEL-NEXT:    fmov w9, s6
+; GISEL-NEXT:    fmov w9, s7
 ; GISEL-NEXT:    eor w10, w10, w11
-; GISEL-NEXT:    fmov w11, s7
-; GISEL-NEXT:    eor w12, w12, w13
+; GISEL-NEXT:    eor w11, w12, w13
 ; GISEL-NEXT:    eor w8, w8, w10
-; GISEL-NEXT:    eor w9, w9, w11
-; GISEL-NEXT:    eor w9, w12, w9
+; GISEL-NEXT:    eor w9, w14, w9
+; GISEL-NEXT:    eor w9, w11, w9
 ; GISEL-NEXT:    eor w0, w8, w9
 ; GISEL-NEXT:    ret
   %xor_result = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %a)

diff --git a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
index 0fe61e38e916d8..873bc6771858e2 100644
--- a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -25,9 +25,9 @@ define i64 @test_chains() {
 ; CHECK-NEXT:    bl _bar
 ; CHECK-NEXT:    ldurb w8, [x29, #-1]
 ; CHECK-NEXT:    add x8, x8, #1
-; CHECK-NEXT:    and x0, x8, #0xff
 ; CHECK-NEXT:    sturb w8, [x29, #-1]
 ; CHECK-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-NEXT:    and x0, x8, #0xff
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 

diff --git a/llvm/test/CodeGen/AArch64/rotate-extract.ll b/llvm/test/CodeGen/AArch64/rotate-extract.ll
index 20008c41c42e84..e3eaf81245ff43 100644
--- a/llvm/test/CodeGen/AArch64/rotate-extract.ll
+++ b/llvm/test/CodeGen/AArch64/rotate-extract.ll
@@ -50,7 +50,7 @@ define i32 @ror_extract_mul(i32 %i) nounwind {
 define i64 @ror_extract_udiv(i64 %i) nounwind {
 ; CHECK-LABEL: ror_extract_udiv:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-6148914691236517206
+; CHECK-NEXT:    mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa
 ; CHECK-NEXT:    movk x8, #43691
 ; CHECK-NEXT:    umulh x8, x0, x8
 ; CHECK-NEXT:    lsr x8, x8, #1
@@ -127,15 +127,15 @@ define i64 @no_extract_mul(i64 %i) nounwind {
 define i32 @no_extract_udiv(i32 %i) nounwind {
 ; CHECK-LABEL: no_extract_udiv:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #33437
-; CHECK-NEXT:    mov w9, #43691
-; CHECK-NEXT:    movk w8, #21399, lsl #16
-; CHECK-NEXT:    movk w9, #43690, lsl #16
+; CHECK-NEXT:    mov w8, #43691 // =0xaaab
+; CHECK-NEXT:    mov w9, #33437 // =0x829d
+; CHECK-NEXT:    movk w8, #43690, lsl #16
+; CHECK-NEXT:    movk w9, #21399, lsl #16
 ; CHECK-NEXT:    umull x8, w0, w8
 ; CHECK-NEXT:    umull x9, w0, w9
-; CHECK-NEXT:    lsr x8, x8, #32
-; CHECK-NEXT:    lsr x9, x9, #33
-; CHECK-NEXT:    extr w0, w9, w8, #4
+; CHECK-NEXT:    lsr x8, x8, #33
+; CHECK-NEXT:    lsr x9, x9, #32
+; CHECK-NEXT:    extr w0, w8, w9, #4
 ; CHECK-NEXT:    ret
   %lhs_div = udiv i32 %i, 3
   %rhs_div = udiv i32 %i, 49

diff --git a/llvm/test/CodeGen/AArch64/sadd_sat.ll b/llvm/test/CodeGen/AArch64/sadd_sat.ll
index 172c2e7de7794e..16326a64f67ee2 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat.ll
@@ -36,11 +36,11 @@ define i16 @func16(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: func16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    mov w9, #32767
+; CHECK-NEXT:    mov w9, #32767 // =0x7fff
 ; CHECK-NEXT:    add w8, w8, w1, sxth
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    csel w8, w8, w9, lt
-; CHECK-NEXT:    mov w9, #-32768
+; CHECK-NEXT:    mov w9, #-32768 // =0xffff8000
 ; CHECK-NEXT:    cmn w8, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret
@@ -51,12 +51,12 @@ define i16 @func16(i16 %x, i16 %y) nounwind {
 define i8 @func8(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: func8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    mov w9, #127
-; CHECK-NEXT:    add w8, w8, w1, sxtb
-; CHECK-NEXT:    cmp w8, #127
-; CHECK-NEXT:    csel w8, w8, w9, lt
-; CHECK-NEXT:    mov w9, #-128
+; CHECK-NEXT:    sxtb w9, w0
+; CHECK-NEXT:    mov w8, #127 // =0x7f
+; CHECK-NEXT:    add w9, w9, w1, sxtb
+; CHECK-NEXT:    cmp w9, #127
+; CHECK-NEXT:    csel w8, w9, w8, lt
+; CHECK-NEXT:    mov w9, #-128 // =0xffffff80
 ; CHECK-NEXT:    cmn w8, #128
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret
@@ -67,13 +67,13 @@ define i8 @func8(i8 %x, i8 %y) nounwind {
 define i4 @func3(i4 %x, i4 %y) nounwind {
 ; CHECK-LABEL: func3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w1, #28
-; CHECK-NEXT:    sbfx w9, w0, #0, #4
-; CHECK-NEXT:    add w8, w9, w8, asr #28
-; CHECK-NEXT:    mov w9, #7
-; CHECK-NEXT:    cmp w8, #7
-; CHECK-NEXT:    csel w8, w8, w9, lt
-; CHECK-NEXT:    mov w9, #-8
+; CHECK-NEXT:    lsl w9, w1, #28
+; CHECK-NEXT:    sbfx w10, w0, #0, #4
+; CHECK-NEXT:    mov w8, #7 // =0x7
+; CHECK-NEXT:    add w9, w10, w9, asr #28
+; CHECK-NEXT:    cmp w9, #7
+; CHECK-NEXT:    csel w8, w9, w8, lt
+; CHECK-NEXT:    mov w9, #-8 // =0xfffffff8
 ; CHECK-NEXT:    cmn w8, #8
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll b/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
index 20e7c8381ae635..49ee5ae261a616 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
@@ -37,13 +37,13 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; CHECK-LABEL: func16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w9, w1, w2
-; CHECK-NEXT:    sxth w10, w0
-; CHECK-NEXT:    mov w8, #32767
-; CHECK-NEXT:    add w9, w10, w9, sxth
-; CHECK-NEXT:    cmp w9, w8
-; CHECK-NEXT:    csel w8, w9, w8, lt
-; CHECK-NEXT:    mov w9, #-32768
+; CHECK-NEXT:    mul w8, w1, w2
+; CHECK-NEXT:    sxth w9, w0
+; CHECK-NEXT:    add w8, w9, w8, sxth
+; CHECK-NEXT:    mov w9, #32767 // =0x7fff
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    csel w8, w8, w9, lt
+; CHECK-NEXT:    mov w9, #-32768 // =0xffff8000
 ; CHECK-NEXT:    cmn w8, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret
@@ -55,13 +55,13 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; CHECK-LABEL: func8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w9, w1, w2
-; CHECK-NEXT:    sxtb w10, w0
-; CHECK-NEXT:    mov w8, #127
-; CHECK-NEXT:    add w9, w10, w9, sxtb
-; CHECK-NEXT:    cmp w9, #127
-; CHECK-NEXT:    csel w8, w9, w8, lt
-; CHECK-NEXT:    mov w9, #-128
+; CHECK-NEXT:    mul w8, w1, w2
+; CHECK-NEXT:    sxtb w9, w0
+; CHECK-NEXT:    add w8, w9, w8, sxtb
+; CHECK-NEXT:    mov w9, #127 // =0x7f
+; CHECK-NEXT:    cmp w8, #127
+; CHECK-NEXT:    csel w8, w8, w9, lt
+; CHECK-NEXT:    mov w9, #-128 // =0xffffff80
 ; CHECK-NEXT:    cmn w8, #128
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret
@@ -73,14 +73,14 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; CHECK-LABEL: func4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w9, w1, w2
-; CHECK-NEXT:    sbfx w10, w0, #0, #4
-; CHECK-NEXT:    mov w8, #7
-; CHECK-NEXT:    lsl w9, w9, #28
-; CHECK-NEXT:    add w9, w10, w9, asr #28
-; CHECK-NEXT:    cmp w9, #7
-; CHECK-NEXT:    csel w8, w9, w8, lt
-; CHECK-NEXT:    mov w9, #-8
+; CHECK-NEXT:    mul w8, w1, w2
+; CHECK-NEXT:    sbfx w9, w0, #0, #4
+; CHECK-NEXT:    lsl w8, w8, #28
+; CHECK-NEXT:    add w8, w9, w8, asr #28
+; CHECK-NEXT:    mov w9, #7 // =0x7
+; CHECK-NEXT:    cmp w8, #7
+; CHECK-NEXT:    csel w8, w8, w9, lt
+; CHECK-NEXT:    mov w9, #-8 // =0xfffffff8
 ; CHECK-NEXT:    cmn w8, #8
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 5e39a0196d74a0..6ec5d22dca1836 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -44,8 +44,8 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    sqadd v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    sqadd v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
@@ -75,8 +75,8 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    sqadd v1.8h, v1.8h, v3.8h
+; CHECK-NEXT:    sqadd v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    ret
   %z = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
@@ -97,9 +97,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    sqadd v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    sqadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <8 x i8>, ptr %px
@@ -116,8 +116,8 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
@@ -133,15 +133,15 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.b }[0], [x1]
-; CHECK-NEXT:    add x8, x1, #1
-; CHECK-NEXT:    ld1 { v1.b }[0], [x0]
-; CHECK-NEXT:    add x9, x0, #1
+; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    ld1 { v1.b }[0], [x1]
+; CHECK-NEXT:    add x8, x0, #1
+; CHECK-NEXT:    add x9, x1, #1
 ; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
 ; CHECK-NEXT:    ld1 { v1.b }[4], [x9]
-; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-NEXT:    sqadd v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
@@ -158,9 +158,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    sqadd v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    sqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i16>, ptr %px
@@ -173,15 +173,15 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.h }[0], [x1]
-; CHECK-NEXT:    add x8, x1, #2
-; CHECK-NEXT:    ld1 { v1.h }[0], [x0]
-; CHECK-NEXT:    add x9, x0, #2
+; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-NEXT:    ld1 { v1.h }[0], [x1]
+; CHECK-NEXT:    add x8, x0, #2
+; CHECK-NEXT:    add x9, x1, #2
 ; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
 ; CHECK-NEXT:    ld1 { v1.h }[2], [x9]
-; CHECK-NEXT:    shl v0.2s, v0.2s, #16
 ; CHECK-NEXT:    shl v1.2s, v1.2s, #16
-; CHECK-NEXT:    sqadd v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sqadd v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
@@ -224,9 +224,9 @@ define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v1i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr b0, [x1]
-; CHECK-NEXT:    ldr b1, [x0]
-; CHECK-NEXT:    sqadd v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    ldr b1, [x1]
+; CHECK-NEXT:    sqadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i8>, ptr %px
@@ -239,9 +239,9 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v1i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr h0, [x1]
-; CHECK-NEXT:    ldr h1, [x0]
-; CHECK-NEXT:    sqadd v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ldr h1, [x1]
+; CHECK-NEXT:    sqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i16>, ptr %px
@@ -297,8 +297,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    sqadd v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    sqadd v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
   %z = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -328,8 +328,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqadd v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    sqadd v1.2d, v1.2d, v3.2d
+; CHECK-NEXT:    sqadd v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -353,16 +353,16 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-NEXT:    adds x8, x2, x6
 ; CHECK-NEXT:    adcs x9, x3, x7
 ; CHECK-NEXT:    asr x10, x9, #63
+; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
 ; CHECK-NEXT:    csel x2, x10, x8, vs
-; CHECK-NEXT:    eor x8, x10, #0x8000000000000000
-; CHECK-NEXT:    csel x3, x8, x9, vs
+; CHECK-NEXT:    csel x3, x11, x9, vs
 ; CHECK-NEXT:    adds x8, x0, x4
 ; CHECK-NEXT:    adcs x9, x1, x5
 ; CHECK-NEXT:    asr x10, x9, #63
 ; CHECK-NEXT:    csel x8, x10, x8, vs
-; CHECK-NEXT:    eor x10, x10, #0x8000000000000000
-; CHECK-NEXT:    csel x1, x10, x9, vs
+; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
 ; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    csel x1, x11, x9, vs
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
index 06fc023d927d45..86c224bee990ad 100644
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -10,7 +10,7 @@ define i8 @unsigned_sat_constant_i8_using_min(i8 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i8_using_min:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w9, w0, #0xff
-; CHECK-NEXT:    mov w8, #-43
+; CHECK-NEXT:    mov w8, #-43 // =0xffffffd5
 ; CHECK-NEXT:    cmp w9, #213
 ; CHECK-NEXT:    csel w8, w0, w8, lo
 ; CHECK-NEXT:    add w0, w8, #42
@@ -52,9 +52,9 @@ define i8 @unsigned_sat_constant_i8_using_cmp_notval(i8 %x) {
 define i16 @unsigned_sat_constant_i16_using_min(i16 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i16_using_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #65493
+; CHECK-NEXT:    mov w8, #65493 // =0xffd5
 ; CHECK-NEXT:    cmp w8, w0, uxth
-; CHECK-NEXT:    mov w8, #-43
+; CHECK-NEXT:    mov w8, #-43 // =0xffffffd5
 ; CHECK-NEXT:    csel w8, w0, w8, hi
 ; CHECK-NEXT:    add w0, w8, #42
 ; CHECK-NEXT:    ret
@@ -81,7 +81,7 @@ define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) {
 define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_notval:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #65493
+; CHECK-NEXT:    mov w8, #65493 // =0xffd5
 ; CHECK-NEXT:    add w9, w0, #42
 ; CHECK-NEXT:    cmp w8, w0, uxth
 ; CHECK-NEXT:    csinv w0, w9, wzr, hs
@@ -95,7 +95,7 @@ define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) {
 define i32 @unsigned_sat_constant_i32_using_min(i32 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i32_using_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-43
+; CHECK-NEXT:    mov w8, #-43 // =0xffffffd5
 ; CHECK-NEXT:    cmn w0, #43
 ; CHECK-NEXT:    csel w8, w0, w8, lo
 ; CHECK-NEXT:    add w0, w8, #42
@@ -133,7 +133,7 @@ define i32 @unsigned_sat_constant_i32_using_cmp_notval(i32 %x) {
 define i64 @unsigned_sat_constant_i64_using_min(i64 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i64_using_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-43
+; CHECK-NEXT:    mov x8, #-43 // =0xffffffffffffffd5
 ; CHECK-NEXT:    cmn x0, #43
 ; CHECK-NEXT:    csel x8, x0, x8, lo
 ; CHECK-NEXT:    add x0, x8, #42
@@ -217,9 +217,9 @@ define i8 @unsigned_sat_variable_i8_using_cmp_notval(i8 %x, i8 %y) {
 define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) {
 ; CHECK-LABEL: unsigned_sat_variable_i16_using_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    and w9, w0, #0xffff
-; CHECK-NEXT:    cmp w9, w8, uxth
+; CHECK-NEXT:    and w8, w0, #0xffff
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    cmp w8, w9, uxth
 ; CHECK-NEXT:    csinv w8, w0, w1, lo
 ; CHECK-NEXT:    add w0, w8, w1
 ; CHECK-NEXT:    ret
@@ -346,9 +346,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_min(<16 x i8> %x) {
 ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_min:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.16b, #213
-; CHECK-NEXT:    movi v2.16b, #42
 ; CHECK-NEXT:    umin v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    add v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    movi v1.16b, #42
+; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %c = icmp ult <16 x i8> %x, <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
   %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
@@ -383,9 +383,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_notval(<16 x i8> %x) {
 define <8 x i16> @unsigned_sat_constant_v8i16_using_min(<8 x i16> %x) {
 ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_min:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvni v1.8h, #42
+; CHECK-NEXT:    umin v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    movi v1.8h, #42
-; CHECK-NEXT:    mvni v2.8h, #42
-; CHECK-NEXT:    umin v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %c = icmp ult <8 x i16> %x, <i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43>
@@ -459,9 +459,9 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval(<4 x i32> %x) {
 define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
 ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-43
+; CHECK-NEXT:    mov x8, #-43 // =0xffffffffffffffd5
 ; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    cmhi v2.2d, v1.2d, v0.2d
 ; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    dup v1.2d, x8
@@ -476,7 +476,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
 define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
 ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    uqadd v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
@@ -489,7 +489,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
 define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
 ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    uqadd v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/select-constant-xor.ll b/llvm/test/CodeGen/AArch64/select-constant-xor.ll
index abd8ee278a6517..3adf48e84b44c1 100644
--- a/llvm/test/CodeGen/AArch64/select-constant-xor.ll
+++ b/llvm/test/CodeGen/AArch64/select-constant-xor.ll
@@ -52,7 +52,7 @@ define i64 @selecti32i64(i32 %a) {
 define i8 @xori32i8(i32 %a) {
 ; CHECK-LABEL: xori32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #84
+; CHECK-NEXT:    mov w8, #84 // =0x54
 ; CHECK-NEXT:    eor w0, w8, w0, asr #31
 ; CHECK-NEXT:    ret
   %shr4 = ashr i32 %a, 31
@@ -64,7 +64,7 @@ define i8 @xori32i8(i32 %a) {
 define i32 @selecti32i32(i32 %a) {
 ; CHECK-LABEL: selecti32i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #84
+; CHECK-NEXT:    mov w8, #84 // =0x54
 ; CHECK-NEXT:    eor w0, w8, w0, asr #31
 ; CHECK-NEXT:    ret
   %c = icmp sgt i32 %a, -1
@@ -75,7 +75,7 @@ define i32 @selecti32i32(i32 %a) {
 define i8 @selecti32i8(i32 %a) {
 ; CHECK-LABEL: selecti32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #84
+; CHECK-NEXT:    mov w8, #84 // =0x54
 ; CHECK-NEXT:    eor w0, w8, w0, asr #31
 ; CHECK-NEXT:    ret
   %c = icmp sgt i32 %a, -1
@@ -87,7 +87,7 @@ define i32 @selecti8i32(i8 %a) {
 ; CHECK-LABEL: selecti8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    mov w9, #84
+; CHECK-NEXT:    mov w9, #84 // =0x54
 ; CHECK-NEXT:    eor w0, w9, w8, asr #7
 ; CHECK-NEXT:    ret
   %c = icmp sgt i8 %a, -1
@@ -200,8 +200,8 @@ define i32 @oneusecmp(i32 %a, i32 %b, i32 %d) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    asr w8, w0, #31
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    eor w8, w8, #0x7f
 ; CHECK-NEXT:    csel w9, w2, w1, lt
+; CHECK-NEXT:    eor w8, w8, #0x7f
 ; CHECK-NEXT:    add w0, w8, w9
 ; CHECK-NEXT:    ret
   %c = icmp sle i32 %a, -1

diff --git a/llvm/test/CodeGen/AArch64/select_const.ll b/llvm/test/CodeGen/AArch64/select_const.ll
index 8da64191a268cc..cd50d776e913f1 100644
--- a/llvm/test/CodeGen/AArch64/select_const.ll
+++ b/llvm/test/CodeGen/AArch64/select_const.ll
@@ -9,7 +9,7 @@
 define i32 @select_0_or_1(i1 %cond) {
 ; CHECK-LABEL: select_0_or_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    bic w0, w8, w0
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i32 0, i32 1
@@ -28,7 +28,7 @@ define i32 @select_0_or_1_zeroext(i1 zeroext %cond) {
 define i32 @select_0_or_1_signext(i1 signext %cond) {
 ; CHECK-LABEL: select_0_or_1_signext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    bic w0, w8, w0
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i32 0, i32 1
@@ -126,7 +126,7 @@ define i32 @select_neg1_or_0_signext(i1 signext %cond) {
 define i32 @select_Cplus1_C(i1 %cond) {
 ; CHECK-LABEL: select_Cplus1_C:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    cinc w0, w8, ne
 ; CHECK-NEXT:    ret
@@ -137,7 +137,7 @@ define i32 @select_Cplus1_C(i1 %cond) {
 define i32 @select_Cplus1_C_zeroext(i1 zeroext %cond) {
 ; CHECK-LABEL: select_Cplus1_C_zeroext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    cinc w0, w8, ne
 ; CHECK-NEXT:    ret
@@ -148,7 +148,7 @@ define i32 @select_Cplus1_C_zeroext(i1 zeroext %cond) {
 define i32 @select_Cplus1_C_signext(i1 signext %cond) {
 ; CHECK-LABEL: select_Cplus1_C_signext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    cinc w0, w8, ne
 ; CHECK-NEXT:    ret
@@ -161,7 +161,7 @@ define i32 @select_Cplus1_C_signext(i1 signext %cond) {
 define i32 @select_C_Cplus1(i1 %cond) {
 ; CHECK-LABEL: select_C_Cplus1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    cinc w0, w8, eq
 ; CHECK-NEXT:    ret
@@ -172,7 +172,7 @@ define i32 @select_C_Cplus1(i1 %cond) {
 define i32 @select_C_Cplus1_zeroext(i1 zeroext %cond) {
 ; CHECK-LABEL: select_C_Cplus1_zeroext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    cmp w0, #0
 ; CHECK-NEXT:    cinc w0, w8, eq
 ; CHECK-NEXT:    ret
@@ -183,7 +183,7 @@ define i32 @select_C_Cplus1_zeroext(i1 zeroext %cond) {
 define i32 @select_C_Cplus1_signext(i1 signext %cond) {
 ; CHECK-LABEL: select_C_Cplus1_signext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #41
+; CHECK-NEXT:    mov w8, #41 // =0x29
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    cinc w0, w8, eq
 ; CHECK-NEXT:    ret
@@ -197,9 +197,9 @@ define i32 @select_C_Cplus1_signext(i1 signext %cond) {
 define i32 @select_C1_C2(i1 %cond) {
 ; CHECK-LABEL: select_C1_C2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #421
+; CHECK-NEXT:    mov w9, #421 // =0x1a5
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i32 421, i32 42
@@ -209,9 +209,9 @@ define i32 @select_C1_C2(i1 %cond) {
 define i32 @select_C1_C2_zeroext(i1 zeroext %cond) {
 ; CHECK-LABEL: select_C1_C2_zeroext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    mov w9, #421
+; CHECK-NEXT:    mov w9, #421 // =0x1a5
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i32 421, i32 42
@@ -221,9 +221,9 @@ define i32 @select_C1_C2_zeroext(i1 zeroext %cond) {
 define i32 @select_C1_C2_signext(i1 signext %cond) {
 ; CHECK-LABEL: select_C1_C2_signext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #421
+; CHECK-NEXT:    mov w9, #421 // =0x1a5
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i32 421, i32 42
@@ -235,7 +235,7 @@ define i32 @select_C1_C2_signext(i1 signext %cond) {
 define i8 @sel_constants_add_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_add_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #28
+; CHECK-NEXT:    mov w8, #28 // =0x1c
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csinc w0, w8, wzr, eq
 ; CHECK-NEXT:    ret
@@ -247,9 +247,9 @@ define i8 @sel_constants_add_constant(i1 %cond) {
 define i8 @sel_constants_sub_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_sub_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #18
+; CHECK-NEXT:    mov w8, #18 // =0x12
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #-9
+; CHECK-NEXT:    mov w9, #-9 // =0xfffffff7
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 -4, i8 23
@@ -260,9 +260,9 @@ define i8 @sel_constants_sub_constant(i1 %cond) {
 define i8 @sel_constants_sub_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: sel_constants_sub_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    mov w8, #2 // =0x2
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #9
+; CHECK-NEXT:    mov w9, #9 // =0x9
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 -4, i8 3
@@ -273,9 +273,9 @@ define i8 @sel_constants_sub_constant_sel_constants(i1 %cond) {
 define i8 @sel_constants_mul_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_mul_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #115
+; CHECK-NEXT:    mov w8, #115 // =0x73
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #-20
+; CHECK-NEXT:    mov w9, #-20 // =0xffffffec
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 -4, i8 23
@@ -286,7 +286,7 @@ define i8 @sel_constants_mul_constant(i1 %cond) {
 define i8 @sel_constants_sdiv_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_sdiv_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov w8, #4 // =0x4
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csel w0, wzr, w8, ne
 ; CHECK-NEXT:    ret
@@ -298,7 +298,7 @@ define i8 @sel_constants_sdiv_constant(i1 %cond) {
 define i8 @sdiv_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: sdiv_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csel w0, wzr, w8, ne
 ; CHECK-NEXT:    ret
@@ -310,9 +310,9 @@ define i8 @sdiv_constant_sel_constants(i1 %cond) {
 define i8 @sel_constants_udiv_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_udiv_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov w8, #4 // =0x4
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #50
+; CHECK-NEXT:    mov w9, #50 // =0x32
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 -4, i8 23
@@ -323,7 +323,7 @@ define i8 @sel_constants_udiv_constant(i1 %cond) {
 define i8 @udiv_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: udiv_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csel w0, wzr, w8, ne
 ; CHECK-NEXT:    ret
@@ -335,7 +335,7 @@ define i8 @udiv_constant_sel_constants(i1 %cond) {
 define i8 @sel_constants_srem_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_srem_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-4
+; CHECK-NEXT:    mov w8, #-4 // =0xfffffffc
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    cinv w0, w8, eq
 ; CHECK-NEXT:    ret
@@ -347,9 +347,9 @@ define i8 @sel_constants_srem_constant(i1 %cond) {
 define i8 @srem_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: srem_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #120
+; CHECK-NEXT:    mov w9, #120 // =0x78
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 121, i8 23
@@ -360,7 +360,7 @@ define i8 @srem_constant_sel_constants(i1 %cond) {
 define i8 @sel_constants_urem_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_urem_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #2
+; CHECK-NEXT:    mov w8, #2 // =0x2
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    cinc w0, w8, eq
 ; CHECK-NEXT:    ret
@@ -372,9 +372,9 @@ define i8 @sel_constants_urem_constant(i1 %cond) {
 define i8 @urem_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: urem_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #120
+; CHECK-NEXT:    mov w9, #120 // =0x78
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 -4, i8 23
@@ -385,7 +385,7 @@ define i8 @urem_constant_sel_constants(i1 %cond) {
 define i8 @sel_constants_and_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_and_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #4
+; CHECK-NEXT:    mov w8, #4 // =0x4
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    cinc w0, w8, eq
 ; CHECK-NEXT:    ret
@@ -397,9 +397,9 @@ define i8 @sel_constants_and_constant(i1 %cond) {
 define i8 @sel_constants_or_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_or_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #23
+; CHECK-NEXT:    mov w8, #23 // =0x17
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #-3
+; CHECK-NEXT:    mov w9, #-3 // =0xfffffffd
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 -4, i8 23
@@ -410,9 +410,9 @@ define i8 @sel_constants_or_constant(i1 %cond) {
 define i8 @sel_constants_xor_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_xor_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #18
+; CHECK-NEXT:    mov w8, #18 // =0x12
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #-7
+; CHECK-NEXT:    mov w9, #-7 // =0xfffffff9
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 -4, i8 23
@@ -423,9 +423,9 @@ define i8 @sel_constants_xor_constant(i1 %cond) {
 define i8 @sel_constants_shl_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_shl_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-32
+; CHECK-NEXT:    mov w8, #-32 // =0xffffffe0
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #-128
+; CHECK-NEXT:    mov w9, #-128 // =0xffffff80
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 -4, i8 23
@@ -436,9 +436,9 @@ define i8 @sel_constants_shl_constant(i1 %cond) {
 define i8 @shl_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: shl_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 // =0x8
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #4
+; CHECK-NEXT:    mov w9, #4 // =0x4
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 2, i8 3
@@ -449,7 +449,7 @@ define i8 @shl_constant_sel_constants(i1 %cond) {
 define i8 @sel_constants_lshr_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_lshr_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #7
+; CHECK-NEXT:    mov w8, #7 // =0x7
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csel w0, w8, wzr, ne
 ; CHECK-NEXT:    ret
@@ -461,9 +461,9 @@ define i8 @sel_constants_lshr_constant(i1 %cond) {
 define i8 @lshr_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: lshr_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 // =0x8
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    mov w9, #16 // =0x10
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 2, i8 3
@@ -485,9 +485,9 @@ define i8 @sel_constants_ashr_constant(i1 %cond) {
 define i8 @ashr_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: ashr_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-16
+; CHECK-NEXT:    mov w8, #-16 // =0xfffffff0
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    mov w9, #-32
+; CHECK-NEXT:    mov w9, #-32 // =0xffffffe0
 ; CHECK-NEXT:    csel w0, w9, w8, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, i8 2, i8 3
@@ -498,13 +498,13 @@ define i8 @ashr_constant_sel_constants(i1 %cond) {
 define double @sel_constants_fadd_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_fadd_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #7378697629483820646
-; CHECK-NEXT:    adrp x9, .LCPI42_0
-; CHECK-NEXT:    movk x8, #16444, lsl #48
+; CHECK-NEXT:    mov x9, #7378697629483820646 // =0x6666666666666666
+; CHECK-NEXT:    adrp x8, .LCPI42_0
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    ldr d1, [x9, :lo12:.LCPI42_0]
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fcsel d0, d1, d0, ne
+; CHECK-NEXT:    movk x9, #16444, lsl #48
+; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI42_0]
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fcsel d0, d0, d1, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, double -4.0, double 23.3
   %bo = fadd double %sel, 5.1
@@ -514,12 +514,12 @@ define double @sel_constants_fadd_constant(i1 %cond) {
 define double @sel_constants_fsub_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_fsub_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #3689348814741910323
 ; CHECK-NEXT:    adrp x8, .LCPI43_0
-; CHECK-NEXT:    movk x9, #49186, lsl #48
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI43_0]
-; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    mov x8, #3689348814741910323 // =0x3333333333333333
+; CHECK-NEXT:    movk x8, #49186, lsl #48
+; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    fcsel d0, d1, d0, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, double -4.0, double 23.3
@@ -530,12 +530,12 @@ define double @sel_constants_fsub_constant(i1 %cond) {
 define double @fsub_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: fsub_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #3689348814741910323
 ; CHECK-NEXT:    adrp x8, .LCPI44_0
-; CHECK-NEXT:    movk x9, #16418, lsl #48
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI44_0]
-; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    mov x8, #3689348814741910323 // =0x3333333333333333
+; CHECK-NEXT:    movk x8, #16418, lsl #48
+; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    fcsel d0, d1, d0, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, double -4.0, double 23.3
@@ -546,12 +546,12 @@ define double @fsub_constant_sel_constants(i1 %cond) {
 define double @sel_constants_fmul_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_fmul_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #7378697629483820646
 ; CHECK-NEXT:    adrp x8, .LCPI45_0
-; CHECK-NEXT:    movk x9, #49204, lsl #48
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI45_0]
-; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    mov x8, #7378697629483820646 // =0x6666666666666666
+; CHECK-NEXT:    movk x8, #49204, lsl #48
+; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    fcsel d0, d1, d0, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, double -4.0, double 23.3
@@ -577,12 +577,12 @@ define double @sel_constants_fdiv_constant(i1 %cond) {
 define double @fdiv_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: fdiv_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #7378697629483820646
 ; CHECK-NEXT:    adrp x8, .LCPI47_0
-; CHECK-NEXT:    movk x9, #49140, lsl #48
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI47_0]
-; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    mov x8, #7378697629483820646 // =0x6666666666666666
+; CHECK-NEXT:    movk x8, #49140, lsl #48
+; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    fcsel d0, d1, d0, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, double -4.0, double 23.3
@@ -594,10 +594,10 @@ define double @sel_constants_frem_constant(i1 %cond) {
 ; CHECK-LABEL: sel_constants_frem_constant:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI48_0
-; CHECK-NEXT:    fmov d1, #-4.00000000
+; CHECK-NEXT:    fmov d0, #-4.00000000
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI48_0]
-; CHECK-NEXT:    fcsel d0, d1, d0, ne
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI48_0]
+; CHECK-NEXT:    fcsel d0, d0, d1, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, double -4.0, double 23.3
   %bo = frem double %sel, 5.1
@@ -607,13 +607,13 @@ define double @sel_constants_frem_constant(i1 %cond) {
 define double @frem_constant_sel_constants(i1 %cond) {
 ; CHECK-LABEL: frem_constant_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #7378697629483820646
-; CHECK-NEXT:    adrp x9, .LCPI49_0
-; CHECK-NEXT:    movk x8, #16404, lsl #48
+; CHECK-NEXT:    mov x9, #7378697629483820646 // =0x6666666666666666
+; CHECK-NEXT:    adrp x8, .LCPI49_0
 ; CHECK-NEXT:    tst w0, #0x1
-; CHECK-NEXT:    ldr d1, [x9, :lo12:.LCPI49_0]
-; CHECK-NEXT:    fmov d0, x8
-; CHECK-NEXT:    fcsel d0, d1, d0, ne
+; CHECK-NEXT:    movk x9, #16404, lsl #48
+; CHECK-NEXT:    ldr d0, [x8, :lo12:.LCPI49_0]
+; CHECK-NEXT:    fmov d1, x9
+; CHECK-NEXT:    fcsel d0, d0, d1, ne
 ; CHECK-NEXT:    ret
   %sel = select i1 %cond, double -4.0, double 23.3
   %bo = frem double 5.1, %sel

diff --git a/llvm/test/CodeGen/AArch64/select_fmf.ll b/llvm/test/CodeGen/AArch64/select_fmf.ll
index c5efd699f61698..5479e5f3b88d2d 100644
--- a/llvm/test/CodeGen/AArch64/select_fmf.ll
+++ b/llvm/test/CodeGen/AArch64/select_fmf.ll
@@ -7,11 +7,11 @@
 define float @select_select_fold_select_and(float %w, float %x, float %y, float %z) {
 ; CHECK-LABEL: select_select_fold_select_and:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fminnm s5, s1, s2
+; CHECK-NEXT:    fminnm s4, s1, s2
 ; CHECK-NEXT:    fcmp s1, s2
 ; CHECK-NEXT:    fmaxnm s1, s0, s3
+; CHECK-NEXT:    fccmp s4, s0, #4, lt
 ; CHECK-NEXT:    fmov s4, #0.50000000
-; CHECK-NEXT:    fccmp s5, s0, #4, lt
 ; CHECK-NEXT:    fcsel s2, s1, s0, gt
 ; CHECK-NEXT:    fadd s1, s0, s4
 ; CHECK-NEXT:    fadd s4, s1, s2
@@ -22,11 +22,11 @@ define float @select_select_fold_select_and(float %w, float %x, float %y, float
 ; CHECK-NEXT:    fadd s0, s2, s0
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_2: // %if.end.i159.i.i
-; CHECK-NEXT:    mov w8, #52429
-; CHECK-NEXT:    mov w9, #13107
+; CHECK-NEXT:    mov w8, #52429 // =0xcccd
+; CHECK-NEXT:    mov w9, #13107 // =0x3333
+; CHECK-NEXT:    fcmp s1, #0.0
 ; CHECK-NEXT:    movk w8, #48844, lsl #16
 ; CHECK-NEXT:    movk w9, #48819, lsl #16
-; CHECK-NEXT:    fcmp s1, #0.0
 ; CHECK-NEXT:    fmov s2, w8
 ; CHECK-NEXT:    fmov s4, w9
 ; CHECK-NEXT:    fadd s0, s0, s2
@@ -65,11 +65,11 @@ exit:                                     ; preds = %if.end.i159.i.i, %if.then.i
 define float @select_select_fold_select_or(float %w, float %x, float %y, float %z) {
 ; CHECK-LABEL: select_select_fold_select_or:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fminnm s5, s1, s2
+; CHECK-NEXT:    fminnm s4, s1, s2
 ; CHECK-NEXT:    fcmp s1, s2
 ; CHECK-NEXT:    fmaxnm s1, s0, s3
+; CHECK-NEXT:    fccmp s4, s0, #0, ge
 ; CHECK-NEXT:    fmov s4, #0.50000000
-; CHECK-NEXT:    fccmp s5, s0, #0, ge
 ; CHECK-NEXT:    fcsel s2, s0, s1, gt
 ; CHECK-NEXT:    fadd s1, s0, s4
 ; CHECK-NEXT:    fadd s4, s1, s2
@@ -80,11 +80,11 @@ define float @select_select_fold_select_or(float %w, float %x, float %y, float %
 ; CHECK-NEXT:    fadd s0, s2, s0
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_2: // %if.end.i159.i.i
-; CHECK-NEXT:    mov w8, #52429
-; CHECK-NEXT:    mov w9, #13107
+; CHECK-NEXT:    mov w8, #52429 // =0xcccd
+; CHECK-NEXT:    mov w9, #13107 // =0x3333
+; CHECK-NEXT:    fcmp s1, #0.0
 ; CHECK-NEXT:    movk w8, #48844, lsl #16
 ; CHECK-NEXT:    movk w9, #48819, lsl #16
-; CHECK-NEXT:    fcmp s1, #0.0
 ; CHECK-NEXT:    fmov s2, w8
 ; CHECK-NEXT:    fmov s4, w9
 ; CHECK-NEXT:    fadd s0, s0, s2

diff --git a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
index 56082bcb4c1bc9..23de2d668cffa7 100644
--- a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
+++ b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
@@ -6,7 +6,7 @@
 define i32 @neg_sel_constants(i32 %a) {
 ; CHECK-LABEL: neg_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    and w0, w8, w0, asr #31
 ; CHECK-NEXT:    ret
   %tmp.1 = icmp slt i32 %a, 0
@@ -58,7 +58,7 @@ define i32 @not_pos_sel_same_variable(i32 %a) {
 define i32 @pos_sel_constants(i32 %a) {
 ; CHECK-LABEL: pos_sel_constants:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    bic w0, w8, w0, asr #31
 ; CHECK-NEXT:    ret
   %tmp.1 = icmp sgt i32 %a, -1
@@ -71,7 +71,7 @@ define i32 @pos_sel_constants(i32 %a) {
 define i32 @pos_sel_special_constant(i32 %a) {
 ; CHECK-LABEL: pos_sel_special_constant:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #512
+; CHECK-NEXT:    mov w8, #512 // =0x200
 ; CHECK-NEXT:    bic w0, w8, w0, lsr #22
 ; CHECK-NEXT:    ret
   %tmp.1 = icmp sgt i32 %a, -1
@@ -121,7 +121,7 @@ define i32 @PR31175(i32 %x, i32 %y) {
 define i8 @sel_shift_bool_i8(i1 %t) {
 ; CHECK-LABEL: sel_shift_bool_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-128
+; CHECK-NEXT:    mov w8, #-128 // =0xffffff80
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csel w0, w8, wzr, ne
 ; CHECK-NEXT:    ret
@@ -132,7 +132,7 @@ define i8 @sel_shift_bool_i8(i1 %t) {
 define i16 @sel_shift_bool_i16(i1 %t) {
 ; CHECK-LABEL: sel_shift_bool_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #128
+; CHECK-NEXT:    mov w8, #128 // =0x80
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csel w0, w8, wzr, ne
 ; CHECK-NEXT:    ret
@@ -143,7 +143,7 @@ define i16 @sel_shift_bool_i16(i1 %t) {
 define i32 @sel_shift_bool_i32(i1 %t) {
 ; CHECK-LABEL: sel_shift_bool_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    mov w8, #64 // =0x40
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csel w0, w8, wzr, ne
 ; CHECK-NEXT:    ret
@@ -154,7 +154,7 @@ define i32 @sel_shift_bool_i32(i1 %t) {
 define i64 @sel_shift_bool_i64(i1 %t) {
 ; CHECK-LABEL: sel_shift_bool_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #65536
+; CHECK-NEXT:    mov w8, #65536 // =0x10000
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csel x0, x8, xzr, ne
 ; CHECK-NEXT:    ret
@@ -165,8 +165,8 @@ define i64 @sel_shift_bool_i64(i1 %t) {
 define <16 x i8> @sel_shift_bool_v16i8(<16 x i1> %t) {
 ; CHECK-LABEL: sel_shift_bool_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.16b, #128
 ; CHECK-NEXT:    shl v0.16b, v0.16b, #7
+; CHECK-NEXT:    movi v1.16b, #128
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -204,9 +204,9 @@ define <2 x i64> @sel_shift_bool_v2i64(<2 x i1> %t) {
 ; CHECK-LABEL: sel_shift_bool_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    mov w8, #65536
-; CHECK-NEXT:    shl v0.2d, v0.2d, #63
+; CHECK-NEXT:    mov w8, #65536 // =0x10000
 ; CHECK-NEXT:    dup v1.2d, x8
+; CHECK-NEXT:    shl v0.2d, v0.2d, #63
 ; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
index e482833ffe4528..c0a728014e390e 100644
--- a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
+++ b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
@@ -6,11 +6,11 @@ define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, ptr %addr) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.4s, #63, msl #16
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    strb w8, [x0]

diff --git a/llvm/test/CodeGen/AArch64/settag-merge-order.ll b/llvm/test/CodeGen/AArch64/settag-merge-order.ll
index ec13a7c99ad425..dd3b1fb071fe54 100644
--- a/llvm/test/CodeGen/AArch64/settag-merge-order.ll
+++ b/llvm/test/CodeGen/AArch64/settag-merge-order.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=aarch64 -mattr=+mte -aarch64-order-frame-objects=1 | FileCheck %s
 
 declare void @use(ptr %p)
@@ -7,13 +8,26 @@ declare void @llvm.aarch64.settag.zero(ptr %p, i64 %a)
 ; Two loops of size 256; the second loop updates SP.
 ; After frame reordering, two loops can be merged into one.
 define void @stg128_128_gap_128_128() {
-entry:
 ; CHECK-LABEL: stg128_128_gap_128_128:
-; CHECK: mov     x8, #512
-; CHECK: st2g    sp, [sp], #32
-; CHECK: subs    x8, x8, #32
-; CHECK: b.ne
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #544
+; CHECK-NEXT:    .cfi_def_cfa_offset 560
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x0, sp, #512
+; CHECK-NEXT:    bl use
+; CHECK-NEXT:    mov x8, #512 // =0x200
+; CHECK-NEXT:  .LBB0_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g sp, [sp], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 128, align 16
   %a2 = alloca i8, i32 128, align 16
   %b = alloca i8, i32 32, align 16
@@ -28,8 +42,51 @@ entry:
 }
 
 define void @stg2(i1 %flag) {
-entry:
 ; CHECK-LABEL: stg2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #608
+; CHECK-NEXT:    .cfi_def_cfa_offset 640
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    mov w19, w0
+; CHECK-NEXT:    add x0, sp, #576
+; CHECK-NEXT:    bl use
+; CHECK-NEXT:    tbz w19, #0, .LBB1_4
+; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    add x9, sp, #256
+; CHECK-NEXT:    mov x8, #320 // =0x140
+; CHECK-NEXT:  .LBB1_2: // %if.then
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g x9, [x9], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB1_2
+; CHECK-NEXT:  // %bb.3: // %if.then
+; CHECK-NEXT:    b .LBB1_7
+; CHECK-NEXT:  .LBB1_4: // %if.else
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov x8, #256 // =0x100
+; CHECK-NEXT:  .LBB1_5: // %if.else
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g x9, [x9], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB1_5
+; CHECK-NEXT:  // %bb.6: // %if.else
+; CHECK-NEXT:  .LBB1_7: // %if.end
+; CHECK-NEXT:    mov x8, #576 // =0x240
+; CHECK-NEXT:  .LBB1_8: // %if.end
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g sp, [sp], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB1_8
+; CHECK-NEXT:  // %bb.9: // %if.end
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 160, align 16
   %a2 = alloca i8, i32 160, align 16
   %b = alloca i8, i32 32, align 16
@@ -39,33 +96,20 @@ entry:
   br i1 %flag, label %if.then, label %if.else
 
 if.then:
-; CHECK: mov     x8, #320
-; CHECK: subs    x8, x8, #32
-; CHECK: st2g    x9, [x9], #32
-; CHECK: b.ne
   call void @llvm.aarch64.settag(ptr %a, i64 160)
   call void @llvm.aarch64.settag(ptr %a2, i64 160)
   br label %if.end
 
 if.else:
-; CHECK: mov     x8, #256
-; CHECK: subs    x8, x8, #32
-; CHECK: st2g    x9, [x9], #32
-; CHECK: b.ne
   call void @llvm.aarch64.settag(ptr %c, i64 128)
   call void @llvm.aarch64.settag(ptr %c2, i64 128)
   br label %if.end
 
 if.end:
-; CHECK: mov     x8, #576
-; CHECK: st2g    sp, [sp], #32
-; CHECK: subs    x8, x8, #32
-; CHECK: b.ne
   call void @llvm.aarch64.settag(ptr %a, i64 160)
   call void @llvm.aarch64.settag(ptr %a2, i64 160)
   call void @llvm.aarch64.settag(ptr %c, i64 128)
   call void @llvm.aarch64.settag(ptr %c2, i64 128)
 
-; CHECK: ret
   ret void
 }

diff --git a/llvm/test/CodeGen/AArch64/settag-merge.ll b/llvm/test/CodeGen/AArch64/settag-merge.ll
index 50cc1fd43227d4..0c00931a1fd0c7 100644
--- a/llvm/test/CodeGen/AArch64/settag-merge.ll
+++ b/llvm/test/CodeGen/AArch64/settag-merge.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=aarch64 -mattr=+mte -aarch64-order-frame-objects=0 | FileCheck %s
 
 declare void @use(ptr %p)
@@ -5,10 +6,13 @@ declare void @llvm.aarch64.settag(ptr %p, i64 %a)
 declare void @llvm.aarch64.settag.zero(ptr %p, i64 %a)
 
 define void @stg16_16() {
-entry:
 ; CHECK-LABEL: stg16_16:
-; CHECK: st2g sp, [sp], #32
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    st2g sp, [sp], #32
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 16, align 16
   %b = alloca i8, i32 16, align 16
   call void @llvm.aarch64.settag(ptr %a, i64 16)
@@ -17,12 +21,15 @@ entry:
 }
 
 define i32 @stg16_16_16_16_ret() {
-entry:
 ; CHECK-LABEL: stg16_16_16_16_ret:
-; CHECK: mov  w0, wzr
-; CHECK: st2g sp, [sp, #32]
-; CHECK: st2g sp, [sp], #64
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    st2g sp, [sp, #32]
+; CHECK-NEXT:    st2g sp, [sp], #64
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 16, align 16
   %b = alloca i8, i32 16, align 16
   %c = alloca i8, i32 16, align 16
@@ -35,11 +42,14 @@ entry:
 }
 
 define void @stg16_16_16_16() {
-entry:
 ; CHECK-LABEL: stg16_16_16_16:
-; CHECK: st2g sp, [sp, #32]
-; CHECK: st2g sp, [sp], #64
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    st2g sp, [sp, #32]
+; CHECK-NEXT:    st2g sp, [sp], #64
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 16, align 16
   %b = alloca i8, i32 16, align 16
   %c = alloca i8, i32 16, align 16
@@ -52,13 +62,22 @@ entry:
 }
 
 define void @stg128_128_128_128() {
-entry:
 ; CHECK-LABEL: stg128_128_128_128:
-; CHECK: mov     x8, #512
-; CHECK: st2g    sp, [sp], #32
-; CHECK: subs    x8, x8, #32
-; CHECK: b.ne
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #512
+; CHECK-NEXT:    .cfi_def_cfa_offset 528
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x8, #512 // =0x200
+; CHECK-NEXT:  .LBB3_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g sp, [sp], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB3_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 128, align 16
   %b = alloca i8, i32 128, align 16
   %c = alloca i8, i32 128, align 16
@@ -71,13 +90,22 @@ entry:
 }
 
 define void @stg16_512_16() {
-entry:
 ; CHECK-LABEL: stg16_512_16:
-; CHECK: mov     x8, #544
-; CHECK: st2g    sp, [sp], #32
-; CHECK: subs    x8, x8, #32
-; CHECK: b.ne
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #544
+; CHECK-NEXT:    .cfi_def_cfa_offset 560
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x8, #544 // =0x220
+; CHECK-NEXT:  .LBB4_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g sp, [sp], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB4_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 16, align 16
   %b = alloca i8, i32 512, align 16
   %c = alloca i8, i32 16, align 16
@@ -88,13 +116,22 @@ entry:
 }
 
 define void @stg512_512_512() {
-entry:
 ; CHECK-LABEL: stg512_512_512:
-; CHECK: mov     x8, #1536
-; CHECK: st2g    sp, [sp], #32
-; CHECK: subs    x8, x8, #32
-; CHECK: b.ne
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #1536
+; CHECK-NEXT:    .cfi_def_cfa_offset 1552
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x8, #1536 // =0x600
+; CHECK-NEXT:  .LBB5_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g sp, [sp], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB5_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 512, align 16
   %b = alloca i8, i32 512, align 16
   %c = alloca i8, i32 512, align 16
@@ -105,16 +142,20 @@ entry:
 }
 
 define void @early(i1 %flag) {
-entry:
 ; CHECK-LABEL: early:
-; CHECK: tbz     w0, #0, [[LABEL:.LBB.*]]
-; CHECK: st2g    sp, [sp, #
-; CHECK: st2g    sp, [sp, #
-; CHECK: st2g    sp, [sp, #
-; CHECK: [[LABEL]]:
-; CHECK: stg     sp, [sp, #
-; CHECK: st2g    sp, [sp], #
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #144
+; CHECK-NEXT:    .cfi_def_cfa_offset 144
+; CHECK-NEXT:    tbz w0, #0, .LBB6_2
+; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    st2g sp, [sp, #48]
+; CHECK-NEXT:    st2g sp, [sp, #80]
+; CHECK-NEXT:    st2g sp, [sp, #112]
+; CHECK-NEXT:  .LBB6_2: // %if.end
+; CHECK-NEXT:    stg sp, [sp, #32]
+; CHECK-NEXT:    st2g sp, [sp], #144
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 48, align 16
   %b = alloca i8, i32 48, align 16
   %c = alloca i8, i32 48, align 16
@@ -131,18 +172,28 @@ if.end:
 }
 
 define void @early_128_128(i1 %flag) {
-entry:
 ; CHECK-LABEL: early_128_128:
-; CHECK: tbz   w0, #0, [[LABEL:.LBB.*]]
-; CHECK: add   x9, sp, #
-; CHECK: mov   x8, #256
-; CHECK: subs  x8, x8, #32
-; CHECK: st2g  x9, [x9], #32
-; CHECK: b.ne
-; CHECK: [[LABEL]]:
-; CHECK: stg     sp, [sp, #
-; CHECK: st2g    sp, [sp], #
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #320
+; CHECK-NEXT:    str x29, [sp, #304] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 320
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    tbz w0, #0, .LBB7_4
+; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    add x9, sp, #48
+; CHECK-NEXT:    mov x8, #256 // =0x100
+; CHECK-NEXT:  .LBB7_2: // %if.then
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g x9, [x9], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB7_2
+; CHECK-NEXT:  // %bb.3: // %if.then
+; CHECK-NEXT:  .LBB7_4: // %if.end
+; CHECK-NEXT:    stg sp, [sp, #32]
+; CHECK-NEXT:    st2g sp, [sp], #304
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 128, align 16
   %b = alloca i8, i32 128, align 16
   %c = alloca i8, i32 48, align 16
@@ -159,18 +210,28 @@ if.end:
 }
 
 define void @early_512_512(i1 %flag) {
-entry:
 ; CHECK-LABEL: early_512_512:
-; CHECK: tbz   w0, #0, [[LABEL:.LBB.*]]
-; CHECK: add   x9, sp, #
-; CHECK: mov   x8, #1024
-; CHECK: subs  x8, x8, #32
-; CHECK: st2g  x9, [x9], #32
-; CHECK: b.ne
-; CHECK: [[LABEL]]:
-; CHECK: stg     sp, [sp, #
-; CHECK: st2g    sp, [sp], #
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #1072
+; CHECK-NEXT:    .cfi_def_cfa_offset 1088
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    tbz w0, #0, .LBB8_4
+; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    add x9, sp, #48
+; CHECK-NEXT:    mov x8, #1024 // =0x400
+; CHECK-NEXT:  .LBB8_2: // %if.then
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g x9, [x9], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB8_2
+; CHECK-NEXT:  // %bb.3: // %if.then
+; CHECK-NEXT:  .LBB8_4: // %if.end
+; CHECK-NEXT:    stg sp, [sp, #32]
+; CHECK-NEXT:    st2g sp, [sp], #1072
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 512, align 16
   %b = alloca i8, i32 512, align 16
   %c = alloca i8, i32 48, align 16
@@ -188,18 +249,34 @@ if.end:
 
 ; Two loops of size 256; the second loop updates SP.
 define void @stg128_128_gap_128_128() {
-entry:
 ; CHECK-LABEL: stg128_128_gap_128_128:
-; CHECK: mov     x9, sp
-; CHECK: mov     x8, #256
-; CHECK: subs    x8, x8, #32
-; CHECK: st2g    x9, [x9], #32
-; CHECK: b.ne
-; CHECK: mov     x8, #256
-; CHECK: st2g    sp, [sp], #32
-; CHECK: subs    x8, x8, #32
-; CHECK: b.ne
-; CHECK: ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #544
+; CHECK-NEXT:    .cfi_def_cfa_offset 560
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x0, sp, #256
+; CHECK-NEXT:    bl use
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov x8, #256 // =0x100
+; CHECK-NEXT:  .LBB9_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g x9, [x9], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB9_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    add sp, sp, #288
+; CHECK-NEXT:    mov x8, #256 // =0x100
+; CHECK-NEXT:  .LBB9_3: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    st2g sp, [sp], #32
+; CHECK-NEXT:    subs x8, x8, #32
+; CHECK-NEXT:    b.ne .LBB9_3
+; CHECK-NEXT:  // %bb.4: // %entry
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
   %a = alloca i8, i32 128, align 16
   %a2 = alloca i8, i32 128, align 16
   %b = alloca i8, i32 32, align 16

diff --git a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll
index 747a21a60241f0..3d094ac8a517d8 100644
--- a/llvm/test/CodeGen/AArch64/settag.ll
+++ b/llvm/test/CodeGen/AArch64/settag.ll
@@ -58,11 +58,11 @@ entry:
 define void @stg16(ptr %p) {
 ; CHECK-LABEL: stg16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #256
+; CHECK-NEXT:    mov x8, #256 // =0x100
 ; CHECK-NEXT:  .LBB5_1: // %entry
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subs x8, x8, #32
 ; CHECK-NEXT:    st2g x0, [x0], #32
+; CHECK-NEXT:    subs x8, x8, #32
 ; CHECK-NEXT:    b.ne .LBB5_1
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    ret
@@ -74,12 +74,12 @@ entry:
 define void @stg17(ptr %p) {
 ; CHECK-LABEL: stg17:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #256
 ; CHECK-NEXT:    stg x0, [x0], #16
+; CHECK-NEXT:    mov x8, #256 // =0x100
 ; CHECK-NEXT:  .LBB6_1: // %entry
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subs x8, x8, #32
 ; CHECK-NEXT:    st2g x0, [x0], #32
+; CHECK-NEXT:    subs x8, x8, #32
 ; CHECK-NEXT:    b.ne .LBB6_1
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    ret
@@ -102,12 +102,12 @@ entry:
 define void @stzg17(ptr %p) {
 ; CHECK-LABEL: stzg17:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #256
 ; CHECK-NEXT:    stzg x0, [x0], #16
+; CHECK-NEXT:    mov x8, #256 // =0x100
 ; CHECK-NEXT:  .LBB8_1: // %entry
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subs x8, x8, #32
 ; CHECK-NEXT:    stz2g x0, [x0], #32
+; CHECK-NEXT:    subs x8, x8, #32
 ; CHECK-NEXT:    b.ne .LBB8_1
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    ret
@@ -150,7 +150,7 @@ define void @stg_alloca17() nounwind {
 ; CHECK-LABEL: stg_alloca17:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #288
-; CHECK-NEXT:    mov x8, #256
+; CHECK-NEXT:    mov x8, #256 // =0x100
 ; CHECK-NEXT:    str x29, [sp, #272] // 8-byte Folded Spill
 ; CHECK-NEXT:  .LBB11_1: // %entry
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -175,12 +175,12 @@ define void @stg_alloca18() uwtable {
 ; CHECK-NEXT:    str x29, [sp, #272] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mov x8, #256
+; CHECK-NEXT:    mov x8, #256 // =0x100
 ; CHECK-NEXT:    stg x9, [x9], #16
 ; CHECK-NEXT:  .LBB12_1: // %entry
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subs x8, x8, #32
 ; CHECK-NEXT:    st2g x9, [x9], #32
+; CHECK-NEXT:    subs x8, x8, #32
 ; CHECK-NEXT:    b.ne .LBB12_1
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    add sp, sp, #272
@@ -198,11 +198,31 @@ entry:
 ; Verify that SLH works together with MTE stack tagging,
 ; see issue https://github.com/llvm/llvm-project/issues/61830
 define void @test_slh() speculative_load_hardening {
-; CHECK-LABEL: test_slh
+; CHECK-LABEL: test_slh:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp sp, #0
+; CHECK-NEXT:    csetm x16, ne
+; CHECK-NEXT:    sub sp, sp, #208
+; CHECK-NEXT:    str x30, [sp, #192] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 208
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x1, sp
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    and x1, x1, x16
+; CHECK-NEXT:    mov sp, x1
+; CHECK-NEXT:    bl b
+; CHECK-NEXT:    cmp sp, #0
+; CHECK-NEXT:    ldr x30, [sp, #192] // 8-byte Folded Reload
+; CHECK-NEXT:    csetm x16, ne
+; CHECK-NEXT:    and x30, x30, x16
+; CHECK-NEXT:    add sp, sp, #208
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    and x0, x0, x16
+; CHECK-NEXT:    mov sp, x0
+; CHECK-NEXT:    csdb
+; CHECK-NEXT:    ret
 ; Verify that the memtag loop uses a b.cc conditional branch
 ; rather than an cb[n]z branch.
-;CHECK-NOT:   cb{{n?}}z
-;CHECK:       b.
   %d = alloca [48 x i32], align 4
   call void @b(ptr %d)
   ret void

diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index d794991895b3c4..fd991104e43465 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -245,12 +245,12 @@ entry:
 define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
 ; CHECK-SD-LABEL: sext_v3i8_v3i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmov s1, w0
-; CHECK-SD-NEXT:    fmov s0, w2
-; CHECK-SD-NEXT:    mov v1.s[1], w1
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    fmov s1, w2
+; CHECK-SD-NEXT:    mov v0.s[1], w1
+; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    shl v2.2d, v1.2d, #56
 ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    shl v2.2d, v0.2d, #56
-; CHECK-SD-NEXT:    ushll v0.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    sshr v2.2d, v2.2d, #56
 ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
@@ -313,9 +313,9 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-SD-NEXT:    sshll v2.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    sshll v0.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v2.2d, v2.4s, #0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SD-NEXT:    ret
 ;
@@ -325,12 +325,12 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov h2, v0.h[2]
 ; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxth x8, w8
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    fmov w10, s2
-; CHECK-GI-NEXT:    sxth x8, w8
+; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    sxth x9, w9
 ; CHECK-GI-NEXT:    sxth x10, w10
-; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
 ; CHECK-GI-NEXT:    ret
@@ -344,8 +344,8 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    sshll v3.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v2.2d, v0.4s, #0
-; CHECK-SD-NEXT:    fmov d0, d3
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    fmov d0, d3
 ; CHECK-SD-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SD-NEXT:    ret
@@ -355,12 +355,12 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) {
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    sxtw x8, w8
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    fmov w10, s2
-; CHECK-GI-NEXT:    sxtw x8, w8
+; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    sxtw x9, w9
 ; CHECK-GI-NEXT:    sxtw x10, w10
-; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
 ; CHECK-GI-NEXT:    ret
@@ -401,12 +401,12 @@ entry:
 define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
 ; CHECK-SD-LABEL: sext_v3i10_v3i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmov s1, w0
-; CHECK-SD-NEXT:    fmov s0, w2
-; CHECK-SD-NEXT:    mov v1.s[1], w1
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    fmov s1, w2
+; CHECK-SD-NEXT:    mov v0.s[1], w1
+; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    shl v2.2d, v1.2d, #54
 ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    shl v2.2d, v0.2d, #54
-; CHECK-SD-NEXT:    ushll v0.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    sshr v2.2d, v2.2d, #54
 ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #54
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
@@ -465,8 +465,8 @@ define <4 x i64> @sext_v4i8_v4i64(<4 x i8> %a) {
 ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-SD-NEXT:    shl v2.2d, v1.2d, #56
 ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
+; CHECK-SD-NEXT:    shl v2.2d, v1.2d, #56
 ; CHECK-SD-NEXT:    sshr v1.2d, v0.2d, #56
 ; CHECK-SD-NEXT:    sshr v0.2d, v2.2d, #56
 ; CHECK-SD-NEXT:    ret
@@ -564,8 +564,8 @@ define <4 x i64> @sext_v4i10_v4i64(<4 x i10> %a) {
 ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-SD-NEXT:    shl v2.2d, v1.2d, #54
 ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #54
+; CHECK-SD-NEXT:    shl v2.2d, v1.2d, #54
 ; CHECK-SD-NEXT:    sshr v1.2d, v0.2d, #54
 ; CHECK-SD-NEXT:    sshr v0.2d, v2.2d, #54
 ; CHECK-SD-NEXT:    ret
@@ -620,11 +620,11 @@ define <8 x i64> @sext_v8i8_v8i64(<8 x i8> %a) {
 ; CHECK-SD-LABEL: sext_v8i8_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll v1.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
-; CHECK-SD-NEXT:    sshll2 v1.2d, v0.4s, #0
-; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -633,13 +633,13 @@ define <8 x i64> @sext_v8i8_v8i64(<8 x i8> %a) {
 ; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    mov d4, v3.d[1]
-; CHECK-GI-NEXT:    sshll v1.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v4.2s, #0
+; CHECK-GI-NEXT:    mov d3, v2.d[1]
+; CHECK-GI-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <8 x i8> %a to <8 x i64>
@@ -667,11 +667,11 @@ entry:
 define <8 x i64> @sext_v8i16_v8i64(<8 x i16> %a) {
 ; CHECK-SD-LABEL: sext_v8i16_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v1.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
-; CHECK-SD-NEXT:    sshll2 v1.2d, v0.4s, #0
-; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -679,13 +679,13 @@ define <8 x i64> @sext_v8i16_v8i64(<8 x i16> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    mov d4, v3.d[1]
-; CHECK-GI-NEXT:    sshll v1.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v4.2s, #0
+; CHECK-GI-NEXT:    mov d3, v2.d[1]
+; CHECK-GI-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <8 x i16> %a to <8 x i64>
@@ -695,21 +695,23 @@ entry:
 define <8 x i64> @sext_v8i32_v8i64(<8 x i32> %a) {
 ; CHECK-SD-LABEL: sext_v8i32_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v5.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v4.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    sshll2 v3.2d, v1.4s, #0
-; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    sshll v2.2d, v1.2s, #0
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v8i32_v8i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
 ; CHECK-GI-NEXT:    sshll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v1.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v3.2s, #0
+; CHECK-GI-NEXT:    mov v1.16b, v4.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <8 x i32> %a to <8 x i64>
@@ -732,8 +734,8 @@ define <8 x i32> @sext_v8i10_v8i32(<8 x i10> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    ushll v1.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-SD-NEXT:    shl v2.4s, v1.4s, #22
 ; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #22
+; CHECK-SD-NEXT:    shl v2.4s, v1.4s, #22
 ; CHECK-SD-NEXT:    sshr v1.4s, v0.4s, #22
 ; CHECK-SD-NEXT:    sshr v0.4s, v2.4s, #22
 ; CHECK-SD-NEXT:    ret
@@ -762,14 +764,14 @@ define <8 x i64> @sext_v8i10_v8i64(<8 x i10> %a) {
 ; CHECK-SD-NEXT:    ushll v3.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #54
 ; CHECK-SD-NEXT:    shl v2.2d, v2.2d, #54
-; CHECK-SD-NEXT:    shl v4.2d, v0.2d, #54
+; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #54
 ; CHECK-SD-NEXT:    shl v5.2d, v3.2d, #54
-; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #54
+; CHECK-SD-NEXT:    shl v4.2d, v0.2d, #54
 ; CHECK-SD-NEXT:    sshr v0.2d, v2.2d, #54
-; CHECK-SD-NEXT:    sshr v3.2d, v4.2d, #54
+; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #54
 ; CHECK-SD-NEXT:    sshr v2.2d, v5.2d, #54
+; CHECK-SD-NEXT:    sshr v3.2d, v4.2d, #54
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v8i10_v8i64:
@@ -777,19 +779,19 @@ define <8 x i64> @sext_v8i10_v8i64(<8 x i10> %a) {
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    shl v2.2d, v2.2d, #54
-; CHECK-GI-NEXT:    ushll v3.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    shl v4.2d, v1.2d, #54
-; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #54
-; CHECK-GI-NEXT:    shl v3.2d, v3.2d, #54
+; CHECK-GI-NEXT:    ushll v3.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    sshr v1.2d, v2.2d, #54
 ; CHECK-GI-NEXT:    sshr v2.2d, v4.2d, #54
+; CHECK-GI-NEXT:    shl v3.2d, v3.2d, #54
 ; CHECK-GI-NEXT:    sshr v3.2d, v3.2d, #54
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -818,11 +820,11 @@ entry:
 define <16 x i32> @sext_v16i8_v16i32(<16 x i8> %a) {
 ; CHECK-SD-LABEL: sext_v16i8_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v1.8h, v0.8b, #0
 ; CHECK-SD-NEXT:    sshll2 v2.8h, v0.16b, #0
-; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll v0.4s, v1.4h, #0
 ; CHECK-SD-NEXT:    sshll2 v3.4s, v2.8h, #0
-; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v1.4s, v1.8h, #0
 ; CHECK-SD-NEXT:    sshll v2.4s, v2.4h, #0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -830,13 +832,13 @@ define <16 x i32> @sext_v16i8_v16i32(<16 x i8> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov d4, v3.d[1]
-; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mov d3, v2.d[1]
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i8> %a to <16 x i32>
@@ -846,45 +848,45 @@ entry:
 define <16 x i64> @sext_v16i8_v16i64(<16 x i8> %a) {
 ; CHECK-SD-LABEL: sext_v16i8_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT:    sshll2 v2.4s, v1.8h, #0
-; CHECK-SD-NEXT:    sshll2 v4.4s, v0.8h, #0
-; CHECK-SD-NEXT:    sshll v16.4s, v1.4h, #0
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    sshll2 v7.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v1.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-SD-NEXT:    sshll2 v4.4s, v1.8h, #0
+; CHECK-SD-NEXT:    sshll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    sshll2 v3.2d, v4.4s, #0
-; CHECK-SD-NEXT:    sshll2 v5.2d, v16.4s, #0
-; CHECK-SD-NEXT:    sshll v6.2d, v2.2s, #0
-; CHECK-SD-NEXT:    sshll2 v1.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    sshll v2.2d, v4.2s, #0
-; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    sshll v4.2d, v16.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v16i8_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    sshll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d5, v2.d[1]
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    sshll v4.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v1.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v16.4s, v5.4h, #0
-; CHECK-GI-NEXT:    mov d3, v4.d[1]
-; CHECK-GI-NEXT:    mov d7, v6.d[1]
-; CHECK-GI-NEXT:    mov d17, v16.d[1]
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v2.2d, v4.2s, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    mov d5, v4.d[1]
+; CHECK-GI-NEXT:    sshll v4.2d, v4.2s, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov d3, v2.d[1]
+; CHECK-GI-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    mov d7, v6.d[1]
+; CHECK-GI-NEXT:    sshll v5.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v6.2s, #0
 ; CHECK-GI-NEXT:    sshll v3.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v7.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v16.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v17.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v7.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i8> %a to <16 x i64>
@@ -894,21 +896,23 @@ entry:
 define <16 x i32> @sext_v16i16_v16i32(<16 x i16> %a) {
 ; CHECK-SD-LABEL: sext_v16i16_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    sshll v5.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    sshll2 v4.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v16i16_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v1.16b, v4.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i16> %a to <16 x i32>
@@ -918,40 +922,39 @@ entry:
 define <16 x i64> @sext_v16i16_v16i64(<16 x i16> %a) {
 ; CHECK-SD-LABEL: sext_v16i16_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    sshll2 v4.4s, v1.8h, #0
-; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-SD-NEXT:    sshll2 v16.2d, v0.4s, #0
-; CHECK-SD-NEXT:    sshll2 v3.2d, v2.4s, #0
-; CHECK-SD-NEXT:    sshll2 v7.2d, v4.4s, #0
-; CHECK-SD-NEXT:    sshll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT:    sshll2 v5.2d, v1.4s, #0
-; CHECK-SD-NEXT:    sshll v6.2d, v4.2s, #0
-; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    sshll v4.2d, v1.2s, #0
-; CHECK-SD-NEXT:    mov v1.16b, v16.16b
+; CHECK-SD-NEXT:    sshll v2.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshll2 v4.4s, v0.8h, #0
+; CHECK-SD-NEXT:    sshll v5.4s, v1.4h, #0
+; CHECK-SD-NEXT:    sshll2 v6.4s, v1.8h, #0
+; CHECK-SD-NEXT:    sshll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    sshll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    sshll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    sshll v6.2d, v6.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v16i16_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v1.d[1]
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v5.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mov d1, v2.d[1]
-; CHECK-GI-NEXT:    sshll v0.2d, v2.2s, #0
-; CHECK-GI-NEXT:    mov d6, v5.d[1]
-; CHECK-GI-NEXT:    sshll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v4.4h, #0
-; CHECK-GI-NEXT:    mov d7, v2.d[1]
-; CHECK-GI-NEXT:    mov d16, v3.d[1]
-; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    mov d7, v1.d[1]
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    sshll v4.2d, v1.2s, #0
+; CHECK-GI-NEXT:    mov d5, v2.d[1]
+; CHECK-GI-NEXT:    mov d16, v6.d[1]
 ; CHECK-GI-NEXT:    sshll v2.2d, v2.2s, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v5.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v6.2s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v3.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v3.2s, #0
+; CHECK-GI-NEXT:    sshll v6.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll v3.2d, v5.2s, #0
+; CHECK-GI-NEXT:    sshll v5.2d, v7.2s, #0
 ; CHECK-GI-NEXT:    sshll v7.2d, v16.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -963,35 +966,34 @@ define <16 x i64> @sext_v16i32_v16i64(<16 x i32> %a) {
 ; CHECK-SD-LABEL: sext_v16i32_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    sshll2 v17.2d, v0.4s, #0
-; CHECK-SD-NEXT:    sshll2 v18.2d, v1.4s, #0
-; CHECK-SD-NEXT:    sshll v16.2d, v1.2s, #0
-; CHECK-SD-NEXT:    sshll2 v5.2d, v2.4s, #0
-; CHECK-SD-NEXT:    sshll2 v7.2d, v3.4s, #0
+; CHECK-SD-NEXT:    sshll2 v16.2d, v1.4s, #0
+; CHECK-SD-NEXT:    sshll v18.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    sshll v4.2d, v2.2s, #0
+; CHECK-SD-NEXT:    sshll2 v5.2d, v2.4s, #0
+; CHECK-SD-NEXT:    sshll2 v7.2d, v3.4s, #0
 ; CHECK-SD-NEXT:    sshll v6.2d, v3.2s, #0
 ; CHECK-SD-NEXT:    mov v1.16b, v17.16b
-; CHECK-SD-NEXT:    mov v2.16b, v16.16b
-; CHECK-SD-NEXT:    mov v3.16b, v18.16b
+; CHECK-SD-NEXT:    mov v2.16b, v18.16b
+; CHECK-SD-NEXT:    mov v3.16b, v16.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: sext_v16i32_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d5, v1.d[1]
-; CHECK-GI-NEXT:    mov d6, v2.d[1]
-; CHECK-GI-NEXT:    sshll v16.2d, v0.2s, #0
-; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    mov d6, v1.d[1]
+; CHECK-GI-NEXT:    mov d5, v0.d[1]
+; CHECK-GI-NEXT:    mov d7, v2.d[1]
+; CHECK-GI-NEXT:    mov d18, v3.d[1]
+; CHECK-GI-NEXT:    sshll v16.2d, v1.2s, #0
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    mov d2, v3.d[1]
-; CHECK-GI-NEXT:    sshll v17.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll v18.2d, v5.2s, #0
-; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    sshll v5.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll v17.2d, v6.2s, #0
+; CHECK-GI-NEXT:    sshll v1.2d, v5.2s, #0
 ; CHECK-GI-NEXT:    sshll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll v7.2d, v2.2s, #0
-; CHECK-GI-NEXT:    mov v0.16b, v16.16b
-; CHECK-GI-NEXT:    mov v2.16b, v17.16b
-; CHECK-GI-NEXT:    mov v3.16b, v18.16b
+; CHECK-GI-NEXT:    sshll v5.2d, v7.2s, #0
+; CHECK-GI-NEXT:    sshll v7.2d, v18.2s, #0
+; CHECK-GI-NEXT:    mov v2.16b, v16.16b
+; CHECK-GI-NEXT:    mov v3.16b, v17.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i32> %a to <16 x i64>
@@ -1003,26 +1005,26 @@ define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr w8, [sp]
 ; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    ldr w9, [sp, #16]
+; CHECK-NEXT:    ldr w9, [sp, #8]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ldr w8, [sp, #8]
+; CHECK-NEXT:    ldr w8, [sp, #16]
 ; CHECK-NEXT:    mov v1.h[1], w1
-; CHECK-NEXT:    mov v0.h[1], w8
-; CHECK-NEXT:    ldr w8, [sp, #24]
+; CHECK-NEXT:    mov v0.h[1], w9
 ; CHECK-NEXT:    mov v1.h[2], w2
-; CHECK-NEXT:    mov v0.h[2], w9
-; CHECK-NEXT:    ldr w9, [sp, #32]
+; CHECK-NEXT:    mov v0.h[2], w8
+; CHECK-NEXT:    ldr w8, [sp, #24]
 ; CHECK-NEXT:    mov v1.h[3], w3
 ; CHECK-NEXT:    mov v0.h[3], w8
-; CHECK-NEXT:    ldr w8, [sp, #40]
+; CHECK-NEXT:    ldr w8, [sp, #32]
 ; CHECK-NEXT:    mov v1.h[4], w4
-; CHECK-NEXT:    mov v0.h[4], w9
-; CHECK-NEXT:    ldr w9, [sp, #48]
+; CHECK-NEXT:    mov v0.h[4], w8
+; CHECK-NEXT:    ldr w8, [sp, #40]
 ; CHECK-NEXT:    mov v1.h[5], w5
 ; CHECK-NEXT:    mov v0.h[5], w8
-; CHECK-NEXT:    ldr w8, [sp, #56]
+; CHECK-NEXT:    ldr w8, [sp, #48]
 ; CHECK-NEXT:    mov v1.h[6], w6
-; CHECK-NEXT:    mov v0.h[6], w9
+; CHECK-NEXT:    mov v0.h[6], w8
+; CHECK-NEXT:    ldr w8, [sp, #56]
 ; CHECK-NEXT:    mov v1.h[7], w7
 ; CHECK-NEXT:    mov v0.h[7], w8
 ; CHECK-NEXT:    shl v1.8h, v1.8h, #6
@@ -1038,29 +1040,29 @@ entry:
 define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) {
 ; CHECK-SD-LABEL: sext_v16i10_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldr w11, [sp, #32]
+; CHECK-SD-NEXT:    ldr w8, [sp, #32]
+; CHECK-SD-NEXT:    ldr w9, [sp]
 ; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    ldr w12, [sp]
 ; CHECK-SD-NEXT:    fmov s1, w4
 ; CHECK-SD-NEXT:    ldr w10, [sp, #40]
-; CHECK-SD-NEXT:    ldr w15, [sp, #8]
-; CHECK-SD-NEXT:    fmov s3, w11
-; CHECK-SD-NEXT:    fmov s2, w12
-; CHECK-SD-NEXT:    ldr w9, [sp, #48]
+; CHECK-SD-NEXT:    ldr w11, [sp, #8]
+; CHECK-SD-NEXT:    fmov s2, w9
+; CHECK-SD-NEXT:    fmov s3, w8
+; CHECK-SD-NEXT:    ldr w8, [sp, #48]
 ; CHECK-SD-NEXT:    mov v0.h[1], w1
-; CHECK-SD-NEXT:    ldr w14, [sp, #16]
+; CHECK-SD-NEXT:    ldr w9, [sp, #16]
 ; CHECK-SD-NEXT:    mov v1.h[1], w5
-; CHECK-SD-NEXT:    ldr w8, [sp, #56]
-; CHECK-SD-NEXT:    mov v2.h[1], w15
-; CHECK-SD-NEXT:    ldr w13, [sp, #24]
+; CHECK-SD-NEXT:    mov v2.h[1], w11
 ; CHECK-SD-NEXT:    mov v3.h[1], w10
 ; CHECK-SD-NEXT:    mov v0.h[2], w2
 ; CHECK-SD-NEXT:    mov v1.h[2], w6
-; CHECK-SD-NEXT:    mov v2.h[2], w14
-; CHECK-SD-NEXT:    mov v3.h[2], w9
+; CHECK-SD-NEXT:    mov v2.h[2], w9
+; CHECK-SD-NEXT:    mov v3.h[2], w8
+; CHECK-SD-NEXT:    ldr w8, [sp, #56]
+; CHECK-SD-NEXT:    ldr w9, [sp, #24]
 ; CHECK-SD-NEXT:    mov v0.h[3], w3
 ; CHECK-SD-NEXT:    mov v1.h[3], w7
-; CHECK-SD-NEXT:    mov v2.h[3], w13
+; CHECK-SD-NEXT:    mov v2.h[3], w9
 ; CHECK-SD-NEXT:    mov v3.h[3], w8
 ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
@@ -1078,36 +1080,36 @@ define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v16i10_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    fmov s5, w4
 ; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    fmov s7, w0
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
-; CHECK-GI-NEXT:    fmov s17, w4
-; CHECK-GI-NEXT:    ldr s4, [sp, #32]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
+; CHECK-GI-NEXT:    ldr s2, [sp, #32]
+; CHECK-GI-NEXT:    ldr s3, [sp, #40]
+; CHECK-GI-NEXT:    mov v4.s[1], w1
+; CHECK-GI-NEXT:    mov v5.s[1], w5
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    ldr s2, [sp, #16]
-; CHECK-GI-NEXT:    mov v7.s[1], w1
-; CHECK-GI-NEXT:    ldr s6, [sp, #48]
-; CHECK-GI-NEXT:    mov v17.s[1], w5
-; CHECK-GI-NEXT:    ldr s3, [sp, #24]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    ldr s16, [sp, #56]
-; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
-; CHECK-GI-NEXT:    mov v7.s[2], w2
-; CHECK-GI-NEXT:    mov v17.s[2], w6
-; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
-; CHECK-GI-NEXT:    mov v7.s[3], w3
-; CHECK-GI-NEXT:    mov v17.s[3], w7
-; CHECK-GI-NEXT:    mov v4.s[3], v16.s[0]
-; CHECK-GI-NEXT:    shl v3.4s, v0.4s, #22
-; CHECK-GI-NEXT:    shl v1.4s, v7.4s, #22
-; CHECK-GI-NEXT:    shl v2.4s, v17.4s, #22
-; CHECK-GI-NEXT:    shl v4.4s, v4.4s, #22
+; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
+; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    ldr s3, [sp, #48]
+; CHECK-GI-NEXT:    mov v4.s[2], w2
+; CHECK-GI-NEXT:    mov v5.s[2], w6
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[2], v3.s[0]
+; CHECK-GI-NEXT:    ldr s1, [sp, #24]
+; CHECK-GI-NEXT:    ldr s3, [sp, #56]
+; CHECK-GI-NEXT:    mov v4.s[3], w3
+; CHECK-GI-NEXT:    mov v5.s[3], w7
+; CHECK-GI-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[3], v3.s[0]
+; CHECK-GI-NEXT:    shl v1.4s, v4.4s, #22
+; CHECK-GI-NEXT:    shl v3.4s, v5.4s, #22
+; CHECK-GI-NEXT:    shl v4.4s, v0.4s, #22
+; CHECK-GI-NEXT:    shl v5.4s, v2.4s, #22
 ; CHECK-GI-NEXT:    sshr v0.4s, v1.4s, #22
-; CHECK-GI-NEXT:    sshr v1.4s, v2.4s, #22
-; CHECK-GI-NEXT:    sshr v2.4s, v3.4s, #22
-; CHECK-GI-NEXT:    sshr v3.4s, v4.4s, #22
+; CHECK-GI-NEXT:    sshr v1.4s, v3.4s, #22
+; CHECK-GI-NEXT:    sshr v2.4s, v4.4s, #22
+; CHECK-GI-NEXT:    sshr v3.4s, v5.4s, #22
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i10> %a to <16 x i32>
@@ -1117,47 +1119,47 @@ entry:
 define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) {
 ; CHECK-SD-LABEL: sext_v16i10_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldr s0, [sp]
-; CHECK-SD-NEXT:    add x8, sp, #8
+; CHECK-SD-NEXT:    fmov s0, w2
 ; CHECK-SD-NEXT:    fmov s1, w0
-; CHECK-SD-NEXT:    fmov s2, w2
+; CHECK-SD-NEXT:    ldr s2, [sp]
 ; CHECK-SD-NEXT:    fmov s3, w4
 ; CHECK-SD-NEXT:    fmov s4, w6
-; CHECK-SD-NEXT:    ld1 { v0.s }[1], [x8]
-; CHECK-SD-NEXT:    add x8, sp, #24
+; CHECK-SD-NEXT:    add x8, sp, #8
 ; CHECK-SD-NEXT:    ldr s5, [sp, #16]
-; CHECK-SD-NEXT:    add x9, sp, #40
 ; CHECK-SD-NEXT:    ldr s6, [sp, #32]
-; CHECK-SD-NEXT:    add x10, sp, #56
 ; CHECK-SD-NEXT:    ldr s7, [sp, #48]
 ; CHECK-SD-NEXT:    mov v1.s[1], w1
+; CHECK-SD-NEXT:    mov v0.s[1], w3
+; CHECK-SD-NEXT:    ld1 { v2.s }[1], [x8]
+; CHECK-SD-NEXT:    mov v3.s[1], w5
+; CHECK-SD-NEXT:    mov v4.s[1], w7
+; CHECK-SD-NEXT:    add x8, sp, #24
+; CHECK-SD-NEXT:    add x9, sp, #40
+; CHECK-SD-NEXT:    add x10, sp, #56
 ; CHECK-SD-NEXT:    ld1 { v5.s }[1], [x8]
-; CHECK-SD-NEXT:    mov v2.s[1], w3
 ; CHECK-SD-NEXT:    ld1 { v6.s }[1], [x9]
-; CHECK-SD-NEXT:    mov v3.s[1], w5
 ; CHECK-SD-NEXT:    ld1 { v7.s }[1], [x10]
-; CHECK-SD-NEXT:    mov v4.s[1], w7
-; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll v3.2d, v3.2s, #0
 ; CHECK-SD-NEXT:    ushll v4.2d, v4.2s, #0
-; CHECK-SD-NEXT:    ushll v16.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll v5.2d, v5.2s, #0
 ; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
 ; CHECK-SD-NEXT:    ushll v7.2d, v7.2s, #0
-; CHECK-SD-NEXT:    shl v0.2d, v1.2d, #54
-; CHECK-SD-NEXT:    shl v1.2d, v2.2d, #54
-; CHECK-SD-NEXT:    shl v2.2d, v3.2d, #54
-; CHECK-SD-NEXT:    shl v3.2d, v4.2d, #54
-; CHECK-SD-NEXT:    shl v4.2d, v16.2d, #54
+; CHECK-SD-NEXT:    shl v17.2d, v2.2d, #54
+; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #54
+; CHECK-SD-NEXT:    shl v16.2d, v0.2d, #54
+; CHECK-SD-NEXT:    shl v3.2d, v3.2d, #54
+; CHECK-SD-NEXT:    shl v4.2d, v4.2d, #54
 ; CHECK-SD-NEXT:    shl v5.2d, v5.2d, #54
 ; CHECK-SD-NEXT:    shl v6.2d, v6.2d, #54
 ; CHECK-SD-NEXT:    shl v7.2d, v7.2d, #54
-; CHECK-SD-NEXT:    sshr v0.2d, v0.2d, #54
-; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #54
-; CHECK-SD-NEXT:    sshr v2.2d, v2.2d, #54
-; CHECK-SD-NEXT:    sshr v3.2d, v3.2d, #54
-; CHECK-SD-NEXT:    sshr v4.2d, v4.2d, #54
+; CHECK-SD-NEXT:    sshr v0.2d, v1.2d, #54
+; CHECK-SD-NEXT:    sshr v1.2d, v16.2d, #54
+; CHECK-SD-NEXT:    sshr v2.2d, v3.2d, #54
+; CHECK-SD-NEXT:    sshr v3.2d, v4.2d, #54
+; CHECK-SD-NEXT:    sshr v4.2d, v17.2d, #54
 ; CHECK-SD-NEXT:    sshr v5.2d, v5.2d, #54
 ; CHECK-SD-NEXT:    sshr v6.2d, v6.2d, #54
 ; CHECK-SD-NEXT:    sshr v7.2d, v7.2d, #54
@@ -1165,50 +1167,50 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v16i10_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s7, w0
+; CHECK-GI-NEXT:    fmov s17, w2
 ; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    fmov s6, w0
+; CHECK-GI-NEXT:    fmov s18, w4
+; CHECK-GI-NEXT:    fmov s19, w6
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
-; CHECK-GI-NEXT:    fmov s16, w2
 ; CHECK-GI-NEXT:    ldr s2, [sp, #16]
-; CHECK-GI-NEXT:    fmov s18, w4
 ; CHECK-GI-NEXT:    ldr s3, [sp, #24]
-; CHECK-GI-NEXT:    fmov s19, w6
 ; CHECK-GI-NEXT:    ldr s4, [sp, #32]
 ; CHECK-GI-NEXT:    ldr s5, [sp, #40]
-; CHECK-GI-NEXT:    ldr s7, [sp, #48]
-; CHECK-GI-NEXT:    ldr s17, [sp, #56]
-; CHECK-GI-NEXT:    mov v6.s[1], w1
-; CHECK-GI-NEXT:    mov v16.s[1], w3
+; CHECK-GI-NEXT:    ldr s6, [sp, #48]
+; CHECK-GI-NEXT:    ldr s16, [sp, #56]
+; CHECK-GI-NEXT:    mov v7.s[1], w1
+; CHECK-GI-NEXT:    mov v17.s[1], w3
 ; CHECK-GI-NEXT:    mov v18.s[1], w5
 ; CHECK-GI-NEXT:    mov v19.s[1], w7
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    mov v7.s[1], v17.s[0]
-; CHECK-GI-NEXT:    ushll v1.2d, v6.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v16.2s, #0
+; CHECK-GI-NEXT:    mov v6.s[1], v16.s[0]
+; CHECK-GI-NEXT:    ushll v1.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v17.2s, #0
 ; CHECK-GI-NEXT:    ushll v5.2d, v18.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v19.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v19.2s, #0
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-GI-NEXT:    ushll v4.2d, v4.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v6.2s, #0
 ; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #54
 ; CHECK-GI-NEXT:    shl v3.2d, v3.2d, #54
 ; CHECK-GI-NEXT:    shl v5.2d, v5.2d, #54
-; CHECK-GI-NEXT:    shl v6.2d, v6.2d, #54
+; CHECK-GI-NEXT:    shl v7.2d, v7.2d, #54
 ; CHECK-GI-NEXT:    shl v16.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    shl v17.2d, v2.2d, #54
 ; CHECK-GI-NEXT:    shl v18.2d, v4.2d, #54
-; CHECK-GI-NEXT:    shl v7.2d, v7.2d, #54
+; CHECK-GI-NEXT:    shl v19.2d, v6.2d, #54
 ; CHECK-GI-NEXT:    sshr v0.2d, v1.2d, #54
 ; CHECK-GI-NEXT:    sshr v1.2d, v3.2d, #54
 ; CHECK-GI-NEXT:    sshr v2.2d, v5.2d, #54
-; CHECK-GI-NEXT:    sshr v3.2d, v6.2d, #54
+; CHECK-GI-NEXT:    sshr v3.2d, v7.2d, #54
 ; CHECK-GI-NEXT:    sshr v4.2d, v16.2d, #54
 ; CHECK-GI-NEXT:    sshr v5.2d, v17.2d, #54
 ; CHECK-GI-NEXT:    sshr v6.2d, v18.2d, #54
-; CHECK-GI-NEXT:    sshr v7.2d, v7.2d, #54
+; CHECK-GI-NEXT:    sshr v7.2d, v19.2d, #54
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i10> %a to <16 x i64>

diff --git a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll
index 4fe609097f2045..d4f3e80e96dd19 100644
--- a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll
+++ b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll
@@ -21,9 +21,9 @@ define i32 @reg32_shl_by_negated(i32 %val, i32 %shamt) nounwind {
 define i32 @load32_shl_by_negated(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: load32_shl_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    lsl w0, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    lsl w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
   %negshamt = sub i32 32, %shamt
@@ -45,9 +45,9 @@ define void @store32_shl_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind
 define void @modify32_shl_by_negated(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: modify32_shl_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    lsl w8, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    lsl w8, w8, w9
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
@@ -59,11 +59,11 @@ define void @modify32_shl_by_negated(ptr %valptr, i32 %shamt) nounwind {
 define void @modify32_shl_by_negated_multi_use(ptr %valptr, i32 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify32_shl_by_negated_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w10, #32
-; CHECK-NEXT:    lsl w8, w9, w8
-; CHECK-NEXT:    sub w9, w10, w1
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    lsl w8, w8, w9
+; CHECK-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEXT:    sub w9, w9, w1
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    str w9, [x2]
 ; CHECK-NEXT:    ret
@@ -88,9 +88,9 @@ define i64 @reg64_shl_by_negated(i64 %val, i64 %shamt) nounwind {
 define i64 @load64_shl_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: load64_shl_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    lsl x0, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    lsl x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
   %negshamt = sub i64 64, %shamt
@@ -112,9 +112,9 @@ define void @store64_shl_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind
 define void @modify64_shl_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: modify64_shl_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    lsl x8, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    lsl x8, x8, x9
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
@@ -126,11 +126,11 @@ define void @modify64_shl_by_negated(ptr %valptr, i64 %shamt) nounwind {
 define void @modify64_shl_by_negated_multi_use(ptr %valptr, i64 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify64_shl_by_negated_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w10, #64
-; CHECK-NEXT:    lsl x8, x9, x8
-; CHECK-NEXT:    sub x9, x10, x1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    lsl x8, x8, x9
+; CHECK-NEXT:    mov w9, #64 // =0x40
+; CHECK-NEXT:    sub x9, x9, x1
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    str x9, [x2]
 ; CHECK-NEXT:    ret
@@ -158,9 +158,9 @@ define i32 @reg32_lshr_by_negated(i32 %val, i32 %shamt) nounwind {
 define i32 @load32_lshr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: load32_lshr_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    lsr w0, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    lsr w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
   %negshamt = sub i32 32, %shamt
@@ -182,9 +182,9 @@ define void @store32_lshr_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind
 define void @modify32_lshr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: modify32_lshr_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    lsr w8, w8, w9
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
@@ -196,11 +196,11 @@ define void @modify32_lshr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 define void @modify32_lshr_by_negated_multi_use(ptr %valptr, i32 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify32_lshr_by_negated_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w10, #32
-; CHECK-NEXT:    lsr w8, w9, w8
-; CHECK-NEXT:    sub w9, w10, w1
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    lsr w8, w8, w9
+; CHECK-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEXT:    sub w9, w9, w1
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    str w9, [x2]
 ; CHECK-NEXT:    ret
@@ -225,9 +225,9 @@ define i64 @reg64_lshr_by_negated(i64 %val, i64 %shamt) nounwind {
 define i64 @load64_lshr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: load64_lshr_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    lsr x0, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    lsr x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
   %negshamt = sub i64 64, %shamt
@@ -249,9 +249,9 @@ define void @store64_lshr_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind
 define void @modify64_lshr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: modify64_lshr_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    lsr x8, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    lsr x8, x8, x9
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
@@ -263,11 +263,11 @@ define void @modify64_lshr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 define void @modify64_lshr_by_negated_multi_use(ptr %valptr, i64 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify64_lshr_by_negated_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w10, #64
-; CHECK-NEXT:    lsr x8, x9, x8
-; CHECK-NEXT:    sub x9, x10, x1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    lsr x8, x8, x9
+; CHECK-NEXT:    mov w9, #64 // =0x40
+; CHECK-NEXT:    sub x9, x9, x1
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    str x9, [x2]
 ; CHECK-NEXT:    ret
@@ -295,9 +295,9 @@ define i32 @reg32_ashr_by_negated(i32 %val, i32 %shamt) nounwind {
 define i32 @load32_ashr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: load32_ashr_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    asr w0, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    asr w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
   %negshamt = sub i32 32, %shamt
@@ -319,9 +319,9 @@ define void @store32_ashr_by_negated(i32 %val, ptr %dstptr, i32 %shamt) nounwind
 define void @modify32_ashr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: modify32_ashr_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    asr w8, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    asr w8, w8, w9
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
@@ -333,11 +333,11 @@ define void @modify32_ashr_by_negated(ptr %valptr, i32 %shamt) nounwind {
 define void @modify32_ashr_by_negated_multi_use(ptr %valptr, i32 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify32_ashr_by_negated_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w10, #32
-; CHECK-NEXT:    asr w8, w9, w8
-; CHECK-NEXT:    sub w9, w10, w1
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    neg w9, w1
+; CHECK-NEXT:    asr w8, w8, w9
+; CHECK-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEXT:    sub w9, w9, w1
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    str w9, [x2]
 ; CHECK-NEXT:    ret
@@ -362,9 +362,9 @@ define i64 @reg64_ashr_by_negated(i64 %val, i64 %shamt) nounwind {
 define i64 @load64_ashr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: load64_ashr_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    asr x0, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    asr x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
   %negshamt = sub i64 64, %shamt
@@ -386,9 +386,9 @@ define void @store64_ashr_by_negated(i64 %val, ptr %dstptr, i64 %shamt) nounwind
 define void @modify64_ashr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: modify64_ashr_by_negated:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    asr x8, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    asr x8, x8, x9
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
@@ -400,11 +400,11 @@ define void @modify64_ashr_by_negated(ptr %valptr, i64 %shamt) nounwind {
 define void @modify64_ashr_by_negated_multi_use(ptr %valptr, i64 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify64_ashr_by_negated_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    neg x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w10, #64
-; CHECK-NEXT:    asr x8, x9, x8
-; CHECK-NEXT:    sub x9, x10, x1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    neg x9, x1
+; CHECK-NEXT:    asr x8, x8, x9
+; CHECK-NEXT:    mov w9, #64 // =0x40
+; CHECK-NEXT:    sub x9, x9, x1
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    str x9, [x2]
 ; CHECK-NEXT:    ret
@@ -436,9 +436,9 @@ define i32 @reg32_shl_by_complemented(i32 %val, i32 %shamt) nounwind {
 define i32 @load32_shl_by_complemented(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: load32_shl_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    lsl w0, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    lsl w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
   %negshamt = sub i32 31, %shamt
@@ -460,9 +460,9 @@ define void @store32_shl_by_complemented(i32 %val, ptr %dstptr, i32 %shamt) noun
 define void @modify32_shl_by_complemented(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: modify32_shl_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    lsl w8, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    lsl w8, w8, w9
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
@@ -474,11 +474,11 @@ define void @modify32_shl_by_complemented(ptr %valptr, i32 %shamt) nounwind {
 define void @modify32_shl_by_complemented_multi_use(ptr %valptr, i32 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify32_shl_by_complemented_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w10, #31
-; CHECK-NEXT:    lsl w8, w9, w8
-; CHECK-NEXT:    sub w9, w10, w1
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    lsl w8, w8, w9
+; CHECK-NEXT:    mov w9, #31 // =0x1f
+; CHECK-NEXT:    sub w9, w9, w1
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    str w9, [x2]
 ; CHECK-NEXT:    ret
@@ -503,9 +503,9 @@ define i64 @reg64_shl_by_complemented(i64 %val, i64 %shamt) nounwind {
 define i64 @load64_shl_by_complemented(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: load64_shl_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    lsl x0, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mvn x9, x1
+; CHECK-NEXT:    lsl x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
   %negshamt = sub i64 63, %shamt
@@ -527,9 +527,9 @@ define void @store64_shl_by_complemented(i64 %val, ptr %dstptr, i64 %shamt) noun
 define void @modify64_shl_by_complemented(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: modify64_shl_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    lsl x8, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mvn x9, x1
+; CHECK-NEXT:    lsl x8, x8, x9
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
@@ -541,11 +541,11 @@ define void @modify64_shl_by_complemented(ptr %valptr, i64 %shamt) nounwind {
 define void @modify64_shl_by_complemented_multi_use(ptr %valptr, i64 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify64_shl_by_complemented_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w10, #63
-; CHECK-NEXT:    lsl x8, x9, x8
-; CHECK-NEXT:    sub x9, x10, x1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mvn x9, x1
+; CHECK-NEXT:    lsl x8, x8, x9
+; CHECK-NEXT:    mov w9, #63 // =0x3f
+; CHECK-NEXT:    sub x9, x9, x1
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    str x9, [x2]
 ; CHECK-NEXT:    ret
@@ -573,9 +573,9 @@ define i32 @reg32_lshr_by_complemented(i32 %val, i32 %shamt) nounwind {
 define i32 @load32_lshr_by_complemented(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: load32_lshr_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    lsr w0, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    lsr w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
   %negshamt = sub i32 31, %shamt
@@ -597,9 +597,9 @@ define void @store32_lshr_by_complemented(i32 %val, ptr %dstptr, i32 %shamt) nou
 define void @modify32_lshr_by_complemented(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: modify32_lshr_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    lsr w8, w8, w9
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
@@ -611,11 +611,11 @@ define void @modify32_lshr_by_complemented(ptr %valptr, i32 %shamt) nounwind {
 define void @modify32_lshr_by_complemented_multi_use(ptr %valptr, i32 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify32_lshr_by_complemented_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w10, #31
-; CHECK-NEXT:    lsr w8, w9, w8
-; CHECK-NEXT:    sub w9, w10, w1
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    lsr w8, w8, w9
+; CHECK-NEXT:    mov w9, #31 // =0x1f
+; CHECK-NEXT:    sub w9, w9, w1
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    str w9, [x2]
 ; CHECK-NEXT:    ret
@@ -640,9 +640,9 @@ define i64 @reg64_lshr_by_complemented(i64 %val, i64 %shamt) nounwind {
 define i64 @load64_lshr_by_complemented(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: load64_lshr_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    lsr x0, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mvn x9, x1
+; CHECK-NEXT:    lsr x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
   %negshamt = sub i64 63, %shamt
@@ -664,9 +664,9 @@ define void @store64_lshr_by_complemented(i64 %val, ptr %dstptr, i64 %shamt) nou
 define void @modify64_lshr_by_complemented(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: modify64_lshr_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    lsr x8, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mvn x9, x1
+; CHECK-NEXT:    lsr x8, x8, x9
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
@@ -678,11 +678,11 @@ define void @modify64_lshr_by_complemented(ptr %valptr, i64 %shamt) nounwind {
 define void @modify64_lshr_by_complemented_multi_use(ptr %valptr, i64 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify64_lshr_by_complemented_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w10, #63
-; CHECK-NEXT:    lsr x8, x9, x8
-; CHECK-NEXT:    sub x9, x10, x1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mvn x9, x1
+; CHECK-NEXT:    lsr x8, x8, x9
+; CHECK-NEXT:    mov w9, #63 // =0x3f
+; CHECK-NEXT:    sub x9, x9, x1
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    str x9, [x2]
 ; CHECK-NEXT:    ret
@@ -710,9 +710,9 @@ define i32 @reg32_ashr_by_complemented(i32 %val, i32 %shamt) nounwind {
 define i32 @load32_ashr_by_complemented(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: load32_ashr_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    asr w0, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    asr w0, w8, w9
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
   %negshamt = sub i32 31, %shamt
@@ -734,9 +734,9 @@ define void @store32_ashr_by_complemented(i32 %val, ptr %dstptr, i32 %shamt) nou
 define void @modify32_ashr_by_complemented(ptr %valptr, i32 %shamt) nounwind {
 ; CHECK-LABEL: modify32_ashr_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    asr w8, w9, w8
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    asr w8, w8, w9
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i32, ptr %valptr
@@ -748,11 +748,11 @@ define void @modify32_ashr_by_complemented(ptr %valptr, i32 %shamt) nounwind {
 define void @modify32_ashr_by_complemented_multi_use(ptr %valptr, i32 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify32_ashr_by_complemented_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn w8, w1
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    mov w10, #31
-; CHECK-NEXT:    asr w8, w9, w8
-; CHECK-NEXT:    sub w9, w10, w1
+; CHECK-NEXT:    ldr w8, [x0]
+; CHECK-NEXT:    mvn w9, w1
+; CHECK-NEXT:    asr w8, w8, w9
+; CHECK-NEXT:    mov w9, #31 // =0x1f
+; CHECK-NEXT:    sub w9, w9, w1
 ; CHECK-NEXT:    str w8, [x0]
 ; CHECK-NEXT:    str w9, [x2]
 ; CHECK-NEXT:    ret
@@ -777,9 +777,9 @@ define i64 @reg64_ashr_by_complemented(i64 %val, i64 %shamt) nounwind {
 define i64 @load64_ashr_by_complemented(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: load64_ashr_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    asr x0, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mvn x9, x1
+; CHECK-NEXT:    asr x0, x8, x9
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
   %negshamt = sub i64 63, %shamt
@@ -801,9 +801,9 @@ define void @store64_ashr_by_complemented(i64 %val, ptr %dstptr, i64 %shamt) nou
 define void @modify64_ashr_by_complemented(ptr %valptr, i64 %shamt) nounwind {
 ; CHECK-LABEL: modify64_ashr_by_complemented:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    asr x8, x9, x8
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mvn x9, x1
+; CHECK-NEXT:    asr x8, x8, x9
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    ret
   %val = load i64, ptr %valptr
@@ -815,11 +815,11 @@ define void @modify64_ashr_by_complemented(ptr %valptr, i64 %shamt) nounwind {
 define void @modify64_ashr_by_complemented_multi_use(ptr %valptr, i64 %shamt, ptr %shamtptr) nounwind {
 ; CHECK-LABEL: modify64_ashr_by_complemented_multi_use:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvn x8, x1
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w10, #63
-; CHECK-NEXT:    asr x8, x9, x8
-; CHECK-NEXT:    sub x9, x10, x1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mvn x9, x1
+; CHECK-NEXT:    asr x8, x8, x9
+; CHECK-NEXT:    mov w9, #63 // =0x3f
+; CHECK-NEXT:    sub x9, x9, x1
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    str x9, [x2]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/shift-by-signext.ll b/llvm/test/CodeGen/AArch64/shift-by-signext.ll
index 47bdc02d29dfdf..67e2da96084efb 100644
--- a/llvm/test/CodeGen/AArch64/shift-by-signext.ll
+++ b/llvm/test/CodeGen/AArch64/shift-by-signext.ll
@@ -80,11 +80,11 @@ declare i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
 define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n6_fshl:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w8, w1, #1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT:    mvn w8, w2
-; CHECK-NEXT:    lsr w9, w1, #1
+; CHECK-NEXT:    mvn w9, w2
 ; CHECK-NEXT:    lsl w10, w0, w2
-; CHECK-NEXT:    lsr w8, w9, w8
+; CHECK-NEXT:    lsr w8, w8, w9
 ; CHECK-NEXT:    orr w0, w10, w8
 ; CHECK-NEXT:    ret
   %shamt_wide = sext i8 %shamt to i32
@@ -94,11 +94,11 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n7_fshr:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl w8, w0, #1
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT:    mvn w8, w2
-; CHECK-NEXT:    lsl w9, w0, #1
+; CHECK-NEXT:    mvn w9, w2
 ; CHECK-NEXT:    lsr w10, w1, w2
-; CHECK-NEXT:    lsl w8, w9, w8
+; CHECK-NEXT:    lsl w8, w8, w9
 ; CHECK-NEXT:    orr w0, w8, w10
 ; CHECK-NEXT:    ret
   %shamt_wide = sext i8 %shamt to i32

diff --git a/llvm/test/CodeGen/AArch64/shift_minsize.ll b/llvm/test/CodeGen/AArch64/shift_minsize.ll
index cc29e3a5f04f57..235fee718f539d 100644
--- a/llvm/test/CodeGen/AArch64/shift_minsize.ll
+++ b/llvm/test/CodeGen/AArch64/shift_minsize.ll
@@ -17,6 +17,11 @@ define i64 @f0(i64 %val, i64 %amt) minsize optsize {
 ; CHECK-NEXT:    lsl x0, x0, x1
 ; CHECK-NEXT:    ret
 ;
+; CHECK-WIN-LABEL: f0:
+; CHECK-WIN:       // %bb.0:
+; CHECK-WIN-NEXT:    lsl x0, x0, x1
+; CHECK-WIN-NEXT:    ret
+;
 ; CHECK-DARWIN-LABEL: f0:
 ; CHECK-DARWIN:       ; %bb.0:
 ; CHECK-DARWIN-NEXT:    lsl x0, x0, x1
@@ -32,6 +37,12 @@ define i32 @f1(i64 %x, i64 %y) minsize optsize {
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ;
+; CHECK-WIN-LABEL: f1:
+; CHECK-WIN:       // %bb.0:
+; CHECK-WIN-NEXT:    lsl x0, x0, x1
+; CHECK-WIN-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-WIN-NEXT:    ret
+;
 ; CHECK-DARWIN-LABEL: f1:
 ; CHECK-DARWIN:       ; %bb.0:
 ; CHECK-DARWIN-NEXT:    lsl x0, x0, x1
@@ -49,6 +60,12 @@ define i32 @f2(i64 %x, i64 %y) minsize optsize {
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ;
+; CHECK-WIN-LABEL: f2:
+; CHECK-WIN:       // %bb.0:
+; CHECK-WIN-NEXT:    asr x0, x0, x1
+; CHECK-WIN-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-WIN-NEXT:    ret
+;
 ; CHECK-DARWIN-LABEL: f2:
 ; CHECK-DARWIN:       ; %bb.0:
 ; CHECK-DARWIN-NEXT:    asr x0, x0, x1
@@ -66,6 +83,12 @@ define i32 @f3(i64 %x, i64 %y) minsize optsize {
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT:    ret
 ;
+; CHECK-WIN-LABEL: f3:
+; CHECK-WIN:       // %bb.0:
+; CHECK-WIN-NEXT:    lsr x0, x0, x1
+; CHECK-WIN-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-WIN-NEXT:    ret
+;
 ; CHECK-DARWIN-LABEL: f3:
 ; CHECK-DARWIN:       ; %bb.0:
 ; CHECK-DARWIN-NEXT:    lsr x0, x0, x1
@@ -86,18 +109,32 @@ define dso_local { i64, i64 } @shl128(i64 %x.coerce0, i64 %x.coerce1, i8 signext
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
+; CHECK-WIN-LABEL: shl128:
+; CHECK-WIN:       // %bb.0: // %entry
+; CHECK-WIN-NEXT:    lsr x8, x0, #1
+; CHECK-WIN-NEXT:    mvn w9, w2
+; CHECK-WIN-NEXT:    mov w10, w2
+; CHECK-WIN-NEXT:    lsl x11, x0, x10
+; CHECK-WIN-NEXT:    tst x10, #0x40
+; CHECK-WIN-NEXT:    lsr x8, x8, x9
+; CHECK-WIN-NEXT:    lsl x9, x1, x10
+; CHECK-WIN-NEXT:    csel x0, xzr, x11, ne
+; CHECK-WIN-NEXT:    orr x8, x9, x8
+; CHECK-WIN-NEXT:    csel x1, x11, x8, ne
+; CHECK-WIN-NEXT:    ret
+;
 ; CHECK-DARWIN-LABEL: shl128:
 ; CHECK-DARWIN:       ; %bb.0: ; %entry
-; CHECK-DARWIN-NEXT:    mvn w8, w2
-; CHECK-DARWIN-NEXT:    mov w9, w2
-; CHECK-DARWIN-NEXT:    lsr x10, x0, #1
-; CHECK-DARWIN-NEXT:    tst x9, #0x40
-; CHECK-DARWIN-NEXT:    lsr x8, x10, x8
-; CHECK-DARWIN-NEXT:    lsl x10, x1, x9
-; CHECK-DARWIN-NEXT:    orr x8, x10, x8
-; CHECK-DARWIN-NEXT:    lsl x10, x0, x9
-; CHECK-DARWIN-NEXT:    csel x1, x10, x8, ne
-; CHECK-DARWIN-NEXT:    csel x0, xzr, x10, ne
+; CHECK-DARWIN-NEXT:    lsr x8, x0, #1
+; CHECK-DARWIN-NEXT:    mvn w9, w2
+; CHECK-DARWIN-NEXT:    mov w10, w2
+; CHECK-DARWIN-NEXT:    lsl x11, x0, x10
+; CHECK-DARWIN-NEXT:    tst x10, #0x40
+; CHECK-DARWIN-NEXT:    lsr x8, x8, x9
+; CHECK-DARWIN-NEXT:    lsl x9, x1, x10
+; CHECK-DARWIN-NEXT:    csel x0, xzr, x11, ne
+; CHECK-DARWIN-NEXT:    orr x8, x9, x8
+; CHECK-DARWIN-NEXT:    csel x1, x11, x8, ne
 ; CHECK-DARWIN-NEXT:    ret
 
 entry:
@@ -126,19 +163,34 @@ define dso_local { i64, i64 } @ashr128(i64 %x.coerce0, i64 %x.coerce1, i8 signex
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
+; CHECK-WIN-LABEL: ashr128:
+; CHECK-WIN:       // %bb.0: // %entry
+; CHECK-WIN-NEXT:    lsl x8, x1, #1
+; CHECK-WIN-NEXT:    mov w9, w2
+; CHECK-WIN-NEXT:    mvn w10, w2
+; CHECK-WIN-NEXT:    lsr x11, x0, x9
+; CHECK-WIN-NEXT:    asr x12, x1, #63
+; CHECK-WIN-NEXT:    tst x9, #0x40
+; CHECK-WIN-NEXT:    lsl x8, x8, x10
+; CHECK-WIN-NEXT:    asr x10, x1, x9
+; CHECK-WIN-NEXT:    orr x8, x8, x11
+; CHECK-WIN-NEXT:    csel x1, x12, x10, ne
+; CHECK-WIN-NEXT:    csel x0, x10, x8, ne
+; CHECK-WIN-NEXT:    ret
+;
 ; CHECK-DARWIN-LABEL: ashr128:
 ; CHECK-DARWIN:       ; %bb.0: ; %entry
-; CHECK-DARWIN-NEXT:    mov w8, w2
-; CHECK-DARWIN-NEXT:    mvn w9, w2
-; CHECK-DARWIN-NEXT:    lsl x10, x1, #1
-; CHECK-DARWIN-NEXT:    tst x8, #0x40
-; CHECK-DARWIN-NEXT:    lsr x11, x0, x8
-; CHECK-DARWIN-NEXT:    lsl x9, x10, x9
-; CHECK-DARWIN-NEXT:    asr x10, x1, x8
-; CHECK-DARWIN-NEXT:    orr x9, x9, x11
-; CHECK-DARWIN-NEXT:    asr x8, x1, #63
-; CHECK-DARWIN-NEXT:    csel x0, x10, x9, ne
-; CHECK-DARWIN-NEXT:    csel x1, x8, x10, ne
+; CHECK-DARWIN-NEXT:    lsl x8, x1, #1
+; CHECK-DARWIN-NEXT:    mov w9, w2
+; CHECK-DARWIN-NEXT:    mvn w10, w2
+; CHECK-DARWIN-NEXT:    lsr x11, x0, x9
+; CHECK-DARWIN-NEXT:    asr x12, x1, #63
+; CHECK-DARWIN-NEXT:    tst x9, #0x40
+; CHECK-DARWIN-NEXT:    lsl x8, x8, x10
+; CHECK-DARWIN-NEXT:    asr x10, x1, x9
+; CHECK-DARWIN-NEXT:    orr x8, x8, x11
+; CHECK-DARWIN-NEXT:    csel x1, x12, x10, ne
+; CHECK-DARWIN-NEXT:    csel x0, x10, x8, ne
 ; CHECK-DARWIN-NEXT:    ret
 entry:
   %x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128
@@ -166,18 +218,32 @@ define dso_local { i64, i64 } @lshr128(i64 %x.coerce0, i64 %x.coerce1, i8 signex
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
+; CHECK-WIN-LABEL: lshr128:
+; CHECK-WIN:       // %bb.0: // %entry
+; CHECK-WIN-NEXT:    lsl x8, x1, #1
+; CHECK-WIN-NEXT:    mov w9, w2
+; CHECK-WIN-NEXT:    mvn w10, w2
+; CHECK-WIN-NEXT:    lsr x11, x0, x9
+; CHECK-WIN-NEXT:    tst x9, #0x40
+; CHECK-WIN-NEXT:    lsl x8, x8, x10
+; CHECK-WIN-NEXT:    lsr x10, x1, x9
+; CHECK-WIN-NEXT:    orr x8, x8, x11
+; CHECK-WIN-NEXT:    csel x1, xzr, x10, ne
+; CHECK-WIN-NEXT:    csel x0, x10, x8, ne
+; CHECK-WIN-NEXT:    ret
+;
 ; CHECK-DARWIN-LABEL: lshr128:
 ; CHECK-DARWIN:       ; %bb.0: ; %entry
-; CHECK-DARWIN-NEXT:    mov w8, w2
-; CHECK-DARWIN-NEXT:    mvn w9, w2
-; CHECK-DARWIN-NEXT:    lsl x10, x1, #1
-; CHECK-DARWIN-NEXT:    tst x8, #0x40
-; CHECK-DARWIN-NEXT:    lsr x11, x0, x8
-; CHECK-DARWIN-NEXT:    lsl x9, x10, x9
-; CHECK-DARWIN-NEXT:    orr x9, x9, x11
-; CHECK-DARWIN-NEXT:    lsr x10, x1, x8
-; CHECK-DARWIN-NEXT:    csel x0, x10, x9, ne
+; CHECK-DARWIN-NEXT:    lsl x8, x1, #1
+; CHECK-DARWIN-NEXT:    mov w9, w2
+; CHECK-DARWIN-NEXT:    mvn w10, w2
+; CHECK-DARWIN-NEXT:    lsr x11, x0, x9
+; CHECK-DARWIN-NEXT:    tst x9, #0x40
+; CHECK-DARWIN-NEXT:    lsl x8, x8, x10
+; CHECK-DARWIN-NEXT:    lsr x10, x1, x9
+; CHECK-DARWIN-NEXT:    orr x8, x8, x11
 ; CHECK-DARWIN-NEXT:    csel x1, xzr, x10, ne
+; CHECK-DARWIN-NEXT:    csel x0, x10, x8, ne
 ; CHECK-DARWIN-NEXT:    ret
 entry:
   %x.sroa.2.0.insert.ext = zext i64 %x.coerce1 to i128

diff --git a/llvm/test/CodeGen/AArch64/shrink-wrap-byval-inalloca-preallocated.ll b/llvm/test/CodeGen/AArch64/shrink-wrap-byval-inalloca-preallocated.ll
index 210c8701ea9292..8b58ba633b8142 100644
--- a/llvm/test/CodeGen/AArch64/shrink-wrap-byval-inalloca-preallocated.ll
+++ b/llvm/test/CodeGen/AArch64/shrink-wrap-byval-inalloca-preallocated.ll
@@ -13,8 +13,8 @@ define void @test_regular_pointers(ptr %a, ptr %b) {
 ; CHECK-LABEL: test_regular_pointers:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-NEXT:    ldr d1, [x1, #8]
+; CHECK-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-NEXT:    movk x8, #2047, lsl #16
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    fmov d1, x8
@@ -67,8 +67,8 @@ define void @test_byval_pointers(ptr %a, ptr byval(%struct.s) %b) {
 ; CHECK-NEXT:    .cfi_offset w19, -24
 ; CHECK-NEXT:    .cfi_offset w20, -32
 ; CHECK-NEXT:    ldr d0, [sp, #40]
-; CHECK-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-NEXT:    movk x8, #2047, lsl #16
 ; CHECK-NEXT:    fadd d0, d1, d0
 ; CHECK-NEXT:    fmov d1, x8
@@ -115,8 +115,8 @@ define void @test_inalloca_pointers(ptr %a, ptr inalloca(%struct.s) %b) {
 ; CHECK-NEXT:    .cfi_offset w19, -24
 ; CHECK-NEXT:    .cfi_offset w20, -32
 ; CHECK-NEXT:    ldr d0, [sp, #40]
-; CHECK-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-NEXT:    movk x8, #2047, lsl #16
 ; CHECK-NEXT:    fadd d0, d1, d0
 ; CHECK-NEXT:    fmov d1, x8
@@ -163,8 +163,8 @@ define void @test_preallocated_pointers(ptr %a, ptr preallocated(%struct.s) %b)
 ; CHECK-NEXT:    .cfi_offset w19, -24
 ; CHECK-NEXT:    .cfi_offset w20, -32
 ; CHECK-NEXT:    ldr d0, [sp, #40]
-; CHECK-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    mov x8, #1 ; =0x1
 ; CHECK-NEXT:    movk x8, #2047, lsl #16
 ; CHECK-NEXT:    fadd d0, d1, d0
 ; CHECK-NEXT:    fmov d1, x8

diff --git a/llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll b/llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
index 1410daa0b91821..c21ccfe522d748 100644
--- a/llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
+++ b/llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
@@ -88,8 +88,8 @@ declare void @llvm.stackrestore(ptr)
 ; VLA allocation
 ; CHECK: ubfiz	x8, x0, #2, #32
 ; CHECK: mov	x9, sp
-; CHECK: add	x8, x8, #15
 ; CHECK: mov	[[SAVE:x[0-9]+]], sp
+; CHECK: add	x8, x8, #15
 ; CHECK: and	[[X1:x[0-9]+]], [[X1]], #0x7fffffff0
 ; Saving the SP via llvm.stacksave()
 ; CHECK: sub	[[X1]], [[X2:x[0-9]+]], [[X1]]

diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
index 30e5bdb8371fbd..0ef64789ad9724 100644
--- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
@@ -21,11 +21,11 @@
 define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
 ; CHECK-LABEL: shuffle4_v4i8_16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
 ; CHECK-NEXT:    ret
@@ -47,11 +47,11 @@ define <16 x i8> @shuffle4_v4i8_16(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i
 define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
 ; CHECK-LABEL: shuffle4_v4i8_8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI1_0
 ; CHECK-NEXT:    // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI1_0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    tbl v0.8b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.8b
 ; CHECK-NEXT:    ret
@@ -101,17 +101,17 @@ define <8 x i8> @shuffle4_v4i8_8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
 define <16 x i8> @shuffle4_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; CHECK-LABEL: shuffle4_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI2_0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    adrp x9, .LCPI2_1
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    adrp x8, .LCPI2_0
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    mov v2.d[1], v3.d[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT:    adrp x8, .LCPI2_1
+; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI2_1]
 ; CHECK-NEXT:    adrp x8, .LCPI2_2
-; CHECK-NEXT:    ldr d3, [x9, :lo12:.LCPI2_1]
 ; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
 ; CHECK-NEXT:    tbl v1.8b, { v2.16b }, v3.8b
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI2_2]
@@ -178,10 +178,10 @@ define <16 x i8> @shuffle4_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x
 ; CHECK-LABEL: shuffle4_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    adrp x9, .LCPI3_1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    adrp x8, .LCPI3_1
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI3_1]
 ; CHECK-NEXT:    adrp x8, .LCPI3_2
-; CHECK-NEXT:    ldr q3, [x9, :lo12:.LCPI3_1]
 ; CHECK-NEXT:    tbl v1.16b, { v0.16b }, v1.16b
 ; CHECK-NEXT:    tbl v0.16b, { v2.16b }, v3.16b
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_2]
@@ -214,10 +214,10 @@ define <8 x i16> @shuffle4_v8i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x
 ; CHECK-LABEL: shuffle4_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d5, d2
-; CHECK-NEXT:    adrp x8, .LCPI4_0
-; CHECK-NEXT:    fmov d4, d0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    fmov d4, d0
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    mov v4.d[1], v1.d[0]
 ; CHECK-NEXT:    mov v5.d[1], v3.d[0]
@@ -232,12 +232,11 @@ define <8 x i16> @shuffle4_v8i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x
 define <4 x i32> @shuffle4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; CHECK-LABEL: shuffle4_v4i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 v1.4s, v1.4s, v1.4s
 ; CHECK-NEXT:    rev64 v3.4s, v3.4s
-; CHECK-NEXT:    zip1 v4.4s, v1.4s, v1.4s
-; CHECK-NEXT:    zip2 v1.4s, v3.4s, v2.4s
-; CHECK-NEXT:    ext v0.16b, v4.16b, v0.16b, #4
-; CHECK-NEXT:    mov v1.d[1], v0.d[1]
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ext v1.16b, v1.16b, v0.16b, #4
+; CHECK-NEXT:    zip2 v0.4s, v3.4s, v2.4s
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
 ; CHECK-NEXT:    ret
   %x = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %y = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -276,9 +275,9 @@ define <4 x i32> @shuffle4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 define <16 x i8> @shuffle4_v8i8_v16i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; CHECK-LABEL: shuffle4_v8i8_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI6_0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    adrp x8, .LCPI6_0
 ; CHECK-NEXT:    mov v2.d[1], v2.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v0.d[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI6_0]
@@ -315,9 +314,9 @@ define <16 x i8> @shuffle4_v8i8_v16i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8
 define <8 x i8> @shuffle4_v8i8_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; CHECK-LABEL: shuffle4_v8i8_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI7_0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    adrp x8, .LCPI7_0
 ; CHECK-NEXT:    mov v2.d[1], v2.d[0]
 ; CHECK-NEXT:    mov v0.d[1], v0.d[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI7_0]
@@ -355,12 +354,12 @@ define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x
 ; CHECK-LABEL: shuffle4_v4i8_zext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    uzp1 v1.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    adrp x8, .LCPI8_0
-; CHECK-NEXT:    uzp1 v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT:    ushll v3.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NEXT:    tbl v0.16b, { v3.16b, v4.16b }, v1.16b
+; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
 ; CHECK-NEXT:    ret
   %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -390,11 +389,11 @@ define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x
 define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %ce, <4 x i16> %de) {
 ; CHECK-LABEL: shuffle4_v4i16_trunc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-NEXT:    // kill: def $d3 killed $d3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    ldr q4, [x8, :lo12:.LCPI9_0]
+; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b, v3.16b }, v4.16b
 ; CHECK-NEXT:    ret
@@ -429,11 +428,11 @@ define <16 x i8> @shuffle4_v4i16_trunc(<4 x i16> %ae, <4 x i16> %be, <4 x i16> %
 define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) {
 ; CHECK-LABEL: shuffle4_v4i32_trunc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI10_0
 ; CHECK-NEXT:    xtn v4.4h, v0.4s
+; CHECK-NEXT:    adrp x8, .LCPI10_0
 ; CHECK-NEXT:    xtn v5.4h, v1.4s
-; CHECK-NEXT:    xtn v6.4h, v2.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    xtn v6.4h, v2.4s
 ; CHECK-NEXT:    xtn v7.4h, v3.4s
 ; CHECK-NEXT:    tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b
 ; CHECK-NEXT:    ret
@@ -467,11 +466,11 @@ define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %
 define <12 x i8> @shuffle3_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) {
 ; CHECK-LABEL: shuffle3_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI11_0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-NEXT:    adrp x8, .LCPI11_0
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI11_0]
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2
-; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI11_0]
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b, v2.16b }, v3.16b
 ; CHECK-NEXT:    ret
   %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -501,9 +500,9 @@ define <8 x i16> @shuffle3_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
 ; CHECK-LABEL: shuffle3_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov d3, d2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    adrp x8, .LCPI12_0
 ; CHECK-NEXT:    fmov d2, d0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI12_0]
 ; CHECK-NEXT:    mov v2.d[1], v1.d[0]
 ; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
@@ -559,11 +558,11 @@ define <4 x i32> @shuffle3_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: insert4_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI14_0
-; CHECK-NEXT:    adrp x9, .LCPI14_1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov v4.16b, v3.16b
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    adrp x8, .LCPI14_0
+; CHECK-NEXT:    adrp x9, .LCPI14_1
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-NEXT:    mov v3.16b, v1.16b
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI14_0]
@@ -629,16 +628,16 @@ define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8>
 define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: insert4_v16i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov v4.16b, v3.16b
 ; CHECK-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q31_q0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    mov v4.16b, v3.16b
 ; CHECK-NEXT:    mov v3.16b, v1.16b
 ; CHECK-NEXT:    ldr q5, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT:    adrp x8, .LCPI15_1
 ; CHECK-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-NEXT:    tbl v31.16b, { v3.16b, v4.16b }, v5.16b
+; CHECK-NEXT:    adrp x8, .LCPI15_1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_1]
+; CHECK-NEXT:    tbl v31.16b, { v3.16b, v4.16b }, v5.16b
 ; CHECK-NEXT:    tbl v0.16b, { v31.16b, v0.16b }, v1.16b
 ; CHECK-NEXT:    ret
   %e1 = extractelement <8 x i8> %a, i32 4
@@ -698,8 +697,8 @@ define <16 x i16> @test(<2 x double> %l213, <2 x double> %l231, <2 x double> %l2
 ; CHECK-LABEL: test:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frintm v0.2d, v0.2d
-; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    frintm v4.2d, v4.2d
+; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    frintm v1.2d, v1.2d
 ; CHECK-NEXT:    frintm v5.2d, v5.2d
 ; CHECK-NEXT:    frintm v2.2d, v2.2d
@@ -713,20 +712,20 @@ define <16 x i16> @test(<2 x double> %l213, <2 x double> %l231, <2 x double> %l2
 ; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-NEXT:    fcvtzs v6.2d, v6.2d
 ; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
+; CHECK-NEXT:    fcvtzs v7.2d, v7.2d
 ; CHECK-NEXT:    xtn v16.2s, v0.2d
-; CHECK-NEXT:    fcvtzs v0.2d, v7.2d
 ; CHECK-NEXT:    xtn v20.2s, v4.2d
+; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI16_0]
 ; CHECK-NEXT:    xtn v17.2s, v1.2d
 ; CHECK-NEXT:    xtn v21.2s, v5.2d
 ; CHECK-NEXT:    xtn v18.2s, v2.2d
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
 ; CHECK-NEXT:    xtn v22.2s, v6.2d
 ; CHECK-NEXT:    xtn v19.2s, v3.2d
-; CHECK-NEXT:    xtn v23.2s, v0.2d
-; CHECK-NEXT:    tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
-; CHECK-NEXT:    tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b
-; CHECK-NEXT:    uzp1 v0.8h, v2.8h, v1.8h
-; CHECK-NEXT:    uzp2 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT:    xtn v23.2s, v7.2d
+; CHECK-NEXT:    tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
+; CHECK-NEXT:    tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v2.8h
+; CHECK-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
 ; CHECK-NEXT:    ret
   %l214 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l213)
   %l215 = fptosi <2 x double> %l214 to <2 x i16>

diff --git a/llvm/test/CodeGen/AArch64/shuffles.ll b/llvm/test/CodeGen/AArch64/shuffles.ll
index 7be836af6342a2..51d51e5a7834d5 100644
--- a/llvm/test/CodeGen/AArch64/shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/shuffles.ll
@@ -4,18 +4,18 @@
 define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v16.16b, v6.16b, v1.16b, #4
-; CHECK-NEXT:    dup v5.4s, v4.s[0]
-; CHECK-NEXT:    uzp1 v17.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp2 v18.4s, v2.4s, v4.4s
+; CHECK-NEXT:    ext v3.16b, v6.16b, v1.16b, #4
+; CHECK-NEXT:    uzp1 v5.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp2 v16.4s, v2.4s, v4.4s
+; CHECK-NEXT:    dup v17.4s, v4.s[0]
+; CHECK-NEXT:    trn2 v4.4s, v1.4s, v3.4s
+; CHECK-NEXT:    mov v17.s[0], v6.s[3]
+; CHECK-NEXT:    trn2 v1.4s, v5.4s, v1.4s
 ; CHECK-NEXT:    rev64 v3.4s, v7.4s
-; CHECK-NEXT:    trn2 v4.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mov v5.s[0], v6.s[3]
-; CHECK-NEXT:    trn2 v1.4s, v17.4s, v1.4s
-; CHECK-NEXT:    trn1 v2.4s, v18.4s, v2.4s
+; CHECK-NEXT:    trn1 v2.4s, v16.4s, v2.4s
 ; CHECK-NEXT:    mov v4.s[0], v7.s[1]
-; CHECK-NEXT:    mov v3.d[0], v5.d[0]
 ; CHECK-NEXT:    ext v1.16b, v0.16b, v1.16b, #12
+; CHECK-NEXT:    mov v3.d[0], v17.d[0]
 ; CHECK-NEXT:    mov v2.s[3], v7.s[0]
 ; CHECK-NEXT:    mov v0.16b, v4.16b
 ; CHECK-NEXT:    ret
@@ -26,10 +26,10 @@ define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) {
 define <4 x i32> @test_shuf2(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 v2.4s, v7.4s, v6.4s
+; CHECK-NEXT:    zip2 v0.4s, v7.4s, v6.4s
+; CHECK-NEXT:    trn2 v2.4s, v7.4s, v0.4s
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    trn2 v1.4s, v7.4s, v2.4s
-; CHECK-NEXT:    mov v0.d[0], v1.d[0]
+; CHECK-NEXT:    mov v0.d[0], v2.d[0]
 ; CHECK-NEXT:    ret
   %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 29, i32 26, i32 7, i32 4>
   ret <4 x i32> %s3
@@ -60,8 +60,8 @@ define <4 x i32> @test_shuf4(<16 x i32> %x, <16 x i32> %y) {
 define <4 x i32> @test_shuf5(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-LABEL: test_shuf5:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rev64 v0.4s, v7.4s
 ; CHECK-NEXT:    ext v1.16b, v6.16b, v4.16b, #12
+; CHECK-NEXT:    rev64 v0.4s, v7.4s
 ; CHECK-NEXT:    mov v0.d[0], v1.d[0]
 ; CHECK-NEXT:    ret
   %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> <i32 27, i32 16, i32 31, i32 30>
@@ -155,9 +155,9 @@ define <4 x i16> @test_shuf7(<4 x i16> %a, <4 x i16> %b)
 define <8 x i8> @test_shuf8(<8 x i8> %a, <8 x i8> %b)
 ; CHECK-LABEL: test_shuf8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI12_0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    adrp x8, .LCPI12_0
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI12_0]
 ; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
@@ -172,8 +172,8 @@ define <8 x i16> @test_shuf9(<8 x i16> %a, <8 x i16> %b)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI13_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 {
@@ -198,8 +198,8 @@ define <8 x half> @test_shuf11(<8 x half> %a, <8 x half> %b)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 {
@@ -212,8 +212,8 @@ define <8 x half> @test_shuf12(<8 x half> %a, <8 x half> %b)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 {
@@ -226,8 +226,8 @@ define <8 x half> @test_shuf13(<8 x half> %a, <8 x half> %b)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 {
@@ -240,8 +240,8 @@ define <8 x half> @test_shuf14(<8 x half> %a, <8 x half> %b)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI18_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 {
@@ -254,8 +254,8 @@ define <8 x half> @test_shuf15(<8 x half> %a, <8 x half> %b)
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI19_0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret
 {

diff --git a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll
index 1d5d2c2197bced..f4f75bb9c7825f 100644
--- a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll
+++ b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll
@@ -129,9 +129,9 @@ define i32 @sink_sub_of_const_to_sub2(i32 %a, i32 %b) {
 define i32 @sink_sub_from_const_to_sub(i32 %a, i32 %b) {
 ; CHECK-LABEL: sink_sub_from_const_to_sub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, w1
-; CHECK-NEXT:    mov w9, #32 // =0x20
-; CHECK-NEXT:    sub w0, w9, w8
+; CHECK-NEXT:    mov w8, #32 // =0x20
+; CHECK-NEXT:    add w9, w0, w1
+; CHECK-NEXT:    sub w0, w8, w9
 ; CHECK-NEXT:    ret
   %t0 = sub i32 32, %a
   %r = sub i32 %t0, %b
@@ -158,10 +158,10 @@ define i32 @sink_sub_from_const_to_sub2(i32 %a, i32 %b) {
 define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_add_of_const_to_add0:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI12_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI12_0]
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
   %t0 = add <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46> ; constant always on RHS
   %r = add <4 x i32> %t0, %b
@@ -170,10 +170,10 @@ define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_add_of_const_to_add1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI13_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI13_0]
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
   %t0 = add <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46> ; constant always on RHS
   %r = add <4 x i32> %b, %t0
@@ -186,10 +186,10 @@ define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_sub_of_const_to_add0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT:    sub v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    adrp x8, .LCPI14_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %t0 = sub <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46>
   %r = add <4 x i32> %t0, %b
@@ -198,10 +198,10 @@ define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_sub_of_const_to_add1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI15_0]
-; CHECK-NEXT:    sub v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    adrp x8, .LCPI15_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %t0 = sub <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46>
   %r = add <4 x i32> %b, %t0
@@ -214,10 +214,10 @@ define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_sub_from_const_to_add0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    adrp x8, .LCPI16_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %t0 = sub <4 x i32> <i32 42, i32 24, i32 undef, i32 46>, %a
   %r = add <4 x i32> %t0, %b
@@ -226,10 +226,10 @@ define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_sub_from_const_to_add1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI17_0]
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    adrp x8, .LCPI17_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %t0 = sub <4 x i32> <i32 42, i32 24, i32 undef, i32 46>, %a
   %r = add <4 x i32> %b, %t0
@@ -242,10 +242,10 @@ define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_add_of_const_to_sub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI18_0
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI18_0]
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    adrp x8, .LCPI18_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %t0 = add <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46> ; constant always on RHS
   %r = sub <4 x i32> %t0, %b
@@ -254,10 +254,10 @@ define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_add_of_const_to_sub2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI19_0
 ; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI19_0]
-; CHECK-NEXT:    sub v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    adrp x8, .LCPI19_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %t0 = add <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46> ; constant always on RHS
   %r = sub <4 x i32> %b, %t0
@@ -270,10 +270,10 @@ define <4 x i32> @vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_sub_of_const_to_sub:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    adrp x8, .LCPI20_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI20_0]
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI20_0]
-; CHECK-NEXT:    sub v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
   %t0 = sub <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46>
   %r = sub <4 x i32> %t0, %b
@@ -282,10 +282,10 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_sub_of_const_to_sub2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI21_0
 ; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    adrp x8, .LCPI21_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI21_0]
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %t0 = sub <4 x i32> %a, <i32 42, i32 24, i32 undef, i32 46>
   %r = sub <4 x i32> %b, %t0
@@ -298,10 +298,10 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_sub_from_const_to_sub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI22_0
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI22_0]
-; CHECK-NEXT:    sub v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    adrp x8, .LCPI22_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI22_0]
+; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    ret
   %t0 = sub <4 x i32> <i32 42, i32 24, i32 undef, i32 46>, %a
   %r = sub <4 x i32> %t0, %b
@@ -310,10 +310,10 @@ define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @vec_sink_sub_from_const_to_sub2(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vec_sink_sub_from_const_to_sub2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI23_0
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI23_0]
-; CHECK-NEXT:    sub v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    adrp x8, .LCPI23_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI23_0]
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %t0 = sub <4 x i32> <i32 42, i32 24, i32 undef, i32 46>, %a
   %r = sub <4 x i32> %b, %t0

diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll
index c85f0bc4ff82db..ca51c7c85d2c9c 100644
--- a/llvm/test/CodeGen/AArch64/sinksplat.ll
+++ b/llvm/test/CodeGen/AArch64/sinksplat.ll
@@ -5,8 +5,8 @@ define <4 x i32> @smull(<4 x i16> %x, ptr %y) {
 ; CHECK-LABEL: smull:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov d1, d0
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:  .LBB0_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
@@ -37,8 +37,8 @@ define <4 x i32> @umull(<4 x i16> %x, ptr %y) {
 ; CHECK-LABEL: umull:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov d1, d0
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:  .LBB1_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
@@ -69,8 +69,8 @@ define <4 x i32> @sqadd(<4 x i32> %x, ptr %y) {
 ; CHECK-LABEL: sqadd:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:  .LBB2_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
@@ -102,8 +102,8 @@ define <4 x i32> @sqsub(<4 x i32> %x, ptr %y) {
 ; CHECK-LABEL: sqsub:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:  .LBB3_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
@@ -135,8 +135,8 @@ define <4 x i32> @sqdmulh(<4 x i32> %x, ptr %y) {
 ; CHECK-LABEL: sqdmulh:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:  .LBB4_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
@@ -168,8 +168,8 @@ define <4 x i32> @sqdmull(<4 x i16> %x, ptr %y) {
 ; CHECK-LABEL: sqdmull:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov d1, d0
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:  .LBB5_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
@@ -201,8 +201,8 @@ define <4 x i32> @mlal(<4 x i32> %x, ptr %y) {
 ; CHECK-LABEL: mlal:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB6_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -234,8 +234,8 @@ define <4 x float> @fmul(<4 x float> %x, ptr %y) {
 ; CHECK-LABEL: fmul:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:  .LBB7_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
@@ -267,8 +267,8 @@ define <4 x float> @fmuladd(<4 x float> %x, ptr %y) {
 ; CHECK-LABEL: fmuladd:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB8_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -300,16 +300,16 @@ define <4 x float> @fma(<4 x float> %x, ptr %y) {
 ; CHECK-LABEL: fma:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB9_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q3, [x0]
-; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    mov v3.16b, v0.16b
 ; CHECK-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEXT:    fmla v0.4s, v2.4s, v3.4s
+; CHECK-NEXT:    ldr q2, [x0]
+; CHECK-NEXT:    subs w8, w8, #1
+; CHECK-NEXT:    fmla v0.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB9_1
 ; CHECK-NEXT:  // %bb.2: // %l2
 ; CHECK-NEXT:    ret
@@ -334,8 +334,8 @@ define <4 x i32> @smull_nonsplat(<4 x i16> %x, ptr %y) {
 ; CHECK-LABEL: smull_nonsplat:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmov d1, d0
-; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    trn2 v2.4h, v1.4h, v1.4h
 ; CHECK-NEXT:    zip2 v1.4h, v2.4h, v1.4h
 ; CHECK-NEXT:  .LBB10_1: // %l1

diff --git a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll b/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
index 7d041d4c5e75d9..5a5a669e92eebb 100644
--- a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
+++ b/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll
@@ -4,21 +4,21 @@
 define <16 x double> @test_sitofp_fixed(<16 x i32> %in) {
 ; CHECK-LABEL: test_sitofp_fixed:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sshll2.2d v4, v2, #0
-; CHECK-NEXT:    sshll2.2d v5, v0, #0
-; CHECK-NEXT:    sshll2.2d v6, v1, #0
-; CHECK-NEXT:    sshll2.2d v7, v3, #0
+; CHECK-NEXT:    sshll2.2d v4, v0, #0
 ; CHECK-NEXT:    sshll.2d v0, v0, #0
-; CHECK-NEXT:    sshll.2d v16, v1, #0
-; CHECK-NEXT:    sshll.2d v17, v2, #0
+; CHECK-NEXT:    sshll2.2d v5, v1, #0
+; CHECK-NEXT:    sshll.2d v6, v1, #0
+; CHECK-NEXT:    sshll.2d v7, v2, #0
+; CHECK-NEXT:    sshll2.2d v16, v2, #0
+; CHECK-NEXT:    sshll2.2d v17, v3, #0
 ; CHECK-NEXT:    sshll.2d v18, v3, #0
-; CHECK-NEXT:    scvtf.2d v1, v5, #6
+; CHECK-NEXT:    scvtf.2d v1, v4, #6
 ; CHECK-NEXT:    scvtf.2d v0, v0, #6
-; CHECK-NEXT:    scvtf.2d v3, v6, #6
-; CHECK-NEXT:    scvtf.2d v2, v16, #6
-; CHECK-NEXT:    scvtf.2d v5, v4, #6
-; CHECK-NEXT:    scvtf.2d v4, v17, #6
-; CHECK-NEXT:    scvtf.2d v7, v7, #6
+; CHECK-NEXT:    scvtf.2d v3, v5, #6
+; CHECK-NEXT:    scvtf.2d v2, v6, #6
+; CHECK-NEXT:    scvtf.2d v4, v7, #6
+; CHECK-NEXT:    scvtf.2d v5, v16, #6
+; CHECK-NEXT:    scvtf.2d v7, v17, #6
 ; CHECK-NEXT:    scvtf.2d v6, v18, #6
 ; CHECK-NEXT:    ret
 

diff --git a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
index 47a72a86fe6f08..20277f87b4edb4 100644
--- a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
+++ b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll
@@ -149,11 +149,11 @@ define void @test_pass_5args(target("aarch64.svcount") %arg) nounwind {
 ; CHECK-O3:       // %bb.0:
 ; CHECK-O3-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-O3-NEXT:    addvl sp, sp, #-1
-; CHECK-O3-NEXT:    addpl x0, sp, #7
 ; CHECK-O3-NEXT:    mov p1.b, p0.b
+; CHECK-O3-NEXT:    addpl x0, sp, #7
+; CHECK-O3-NEXT:    str p0, [sp, #7, mul vl]
 ; CHECK-O3-NEXT:    mov p2.b, p0.b
 ; CHECK-O3-NEXT:    mov p3.b, p0.b
-; CHECK-O3-NEXT:    str p0, [sp, #7, mul vl]
 ; CHECK-O3-NEXT:    bl take_svcount_5
 ; CHECK-O3-NEXT:    addvl sp, sp, #1
 ; CHECK-O3-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 0118d7df25640b..cea3a5e6ccf6f4 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -49,7 +49,7 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
 ; CHECK-GISEL-NEXT:    bl streaming_callee
 ; CHECK-GISEL-NEXT:    str d0, [sp, #88] // 8-byte Folded Spill
 ; CHECK-GISEL-NEXT:    smstop sm
-; CHECK-GISEL-NEXT:    mov x8, #4631107791820423168
+; CHECK-GISEL-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; CHECK-GISEL-NEXT:    fmov d0, x8
 ; CHECK-GISEL-NEXT:    ldr d1, [sp, #88] // 8-byte Folded Reload
 ; CHECK-GISEL-NEXT:    fadd d0, d1, d0
@@ -108,7 +108,7 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline
 ; CHECK-GISEL-NEXT:    bl normal_callee
 ; CHECK-GISEL-NEXT:    str d0, [sp, #88] // 8-byte Folded Spill
 ; CHECK-GISEL-NEXT:    smstart sm
-; CHECK-GISEL-NEXT:    mov x8, #4631107791820423168
+; CHECK-GISEL-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; CHECK-GISEL-NEXT:    fmov d0, x8
 ; CHECK-GISEL-NEXT:    ldr d1, [sp, #88] // 8-byte Folded Reload
 ; CHECK-GISEL-NEXT:    fadd d0, d1, d0
@@ -141,7 +141,7 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli
 ; CHECK-COMMON-NEXT:    bl normal_callee
 ; CHECK-COMMON-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
-; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168
+; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; CHECK-COMMON-NEXT:    fmov d0, x8
 ; CHECK-COMMON-NEXT:    ldr d1, [sp, #8] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    fadd d0, d1, d0
@@ -246,7 +246,7 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON-NEXT:  .LBB6_2: // %entry
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    bl za_shared_callee
-; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168
+; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; CHECK-COMMON-NEXT:    fmov d1, x8
 ; CHECK-COMMON-NEXT:    fadd d0, d0, d1
 ; CHECK-COMMON-NEXT:    smstop za
@@ -285,7 +285,7 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    b .LBB7_2
 ; CHECK-COMMON-NEXT:  .LBB7_2: // %entry
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168
+; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; CHECK-COMMON-NEXT:    fmov d1, x8
 ; CHECK-COMMON-NEXT:    fadd d0, d0, d1
 ; CHECK-COMMON-NEXT:    mov sp, x29
@@ -309,14 +309,14 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi
 ; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    sub x9, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
 ; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl __addtf3
 ; CHECK-COMMON-NEXT:    smstart za
-; CHECK-COMMON-NEXT:    sub x0, x29, #16
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT:    sub x0, x29, #16
 ; CHECK-COMMON-NEXT:    cbnz x8, .LBB8_2
 ; CHECK-COMMON-NEXT:  // %bb.1:
 ; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
@@ -347,11 +347,11 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
 ; CHECK-COMMON-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    add sp, sp, #112
 ; CHECK-COMMON-NEXT:    ret
   %res = fadd fp128 %a, %b

diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
index ccb3975f0c5b42..c96aca366ed43f 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll
@@ -18,10 +18,10 @@ define void @ld1b(<vscale x 16 x i1> %pg, ptr %ptr, i32 %sliceidx) {
 define void @ld1b_with_addr_offset(<vscale x 16 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
 ; CHECK-LABEL: ld1b_with_addr_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, wzr
-; CHECK-NEXT:    mov w13, w2
-; CHECK-NEXT:    ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1]
-; CHECK-NEXT:    ld1b {za0v.b[w13, 15]}, p0/z, [x0, x1]
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    mov w12, w2
+; CHECK-NEXT:    ld1b {za0h.b[w13, 0]}, p0/z, [x0, x1]
+; CHECK-NEXT:    ld1b {za0v.b[w12, 15]}, p0/z, [x0, x1]
 ; CHECK-NEXT:    ret
   %base = getelementptr i8, ptr %ptr, i64 %index
   %tileslice = add i32 %sliceidx, 15
@@ -66,16 +66,16 @@ define void @ld1h_with_addr_offset(<vscale x 8 x i1> %pg, ptr %ptr, i64 %index,
 define void @ld1w(<vscale x 4 x i1> %pg, ptr %ptr, i32 %sliceidx) {
 ; CHECK-LABEL: ld1w:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, wzr
-; CHECK-NEXT:    mov w13, w1
-; CHECK-NEXT:    ld1w {za0h.s[w12, 0]}, p0/z, [x0]
-; CHECK-NEXT:    ld1w {za1h.s[w12, 0]}, p0/z, [x0]
-; CHECK-NEXT:    ld1w {za2h.s[w12, 0]}, p0/z, [x0]
-; CHECK-NEXT:    ld1w {za3h.s[w13, 3]}, p0/z, [x0]
-; CHECK-NEXT:    ld1w {za0v.s[w12, 0]}, p0/z, [x0]
-; CHECK-NEXT:    ld1w {za1v.s[w12, 0]}, p0/z, [x0]
-; CHECK-NEXT:    ld1w {za2v.s[w13, 3]}, p0/z, [x0]
-; CHECK-NEXT:    ld1w {za3v.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    ld1w {za0h.s[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za1h.s[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za2h.s[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za3h.s[w12, 3]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za0v.s[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za1v.s[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za2v.s[w12, 3]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za3v.s[w13, 0]}, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %tileslice = add i32 %sliceidx, 3
   call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 0, i32 0)
@@ -107,8 +107,8 @@ define void @ld1w_with_addr_offset(<vscale x 4 x i1> %pg, ptr %ptr, i64 %index,
 define void @ld1d(<vscale x 2 x i1> %pg, ptr %ptr, i32 %sliceidx) {
 ; CHECK-LABEL: ld1d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w13, wzr
 ; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    mov w13, wzr
 ; CHECK-NEXT:    ld1d {za0h.d[w13, 0]}, p0/z, [x0]
 ; CHECK-NEXT:    ld1d {za1h.d[w13, 0]}, p0/z, [x0]
 ; CHECK-NEXT:    ld1d {za2h.d[w13, 0]}, p0/z, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index e6ff6b380100ad..768d54d423f294 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -4,21 +4,21 @@
 define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.b, p0/m, za0h.b[w12, 0]
 ; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 2]
-; CHECK-NEXT:    mov z3.b, p0/m, za0h.b[w12, 4]
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.b, p0/m, za0h.b[w12, 6]
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 4]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 6]
+; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 8]
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z3.b, p0/m, za0h.b[w12, 10]
-; CHECK-NEXT:    mov z4.b, p0/m, za0h.b[w12, 12]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 10]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.b, p0/m, za0h.b[w12, 12]
 ; CHECK-NEXT:    mov z0.b, p0/m, za0h.b[w12, 14]
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -43,21 +43,21 @@ define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x
 define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_col_b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.b, p0/m, za0v.b[w12, 1]
 ; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 3]
-; CHECK-NEXT:    mov z3.b, p0/m, za0v.b[w12, 5]
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.b, p0/m, za0v.b[w12, 7]
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 5]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 7]
+; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 9]
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z3.b, p0/m, za0v.b[w12, 11]
-; CHECK-NEXT:    mov z4.b, p0/m, za0v.b[w12, 13]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 11]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.b, p0/m, za0v.b[w12, 13]
 ; CHECK-NEXT:    mov z0.b, p0/m, za0v.b[w12, 15]
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -83,13 +83,13 @@ define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x
 define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 0]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 2]
-; CHECK-NEXT:    mov z3.h, p0/m, za0h.h[w12, 4]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
 ; CHECK-NEXT:    mov z0.h, p0/m, za0h.h[w12, 6]
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -106,13 +106,13 @@ define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
 define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_col_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.h, p0/m, za1v.h[w12, 1]
 ; CHECK-NEXT:    mov z2.h, p0/m, za1v.h[w12, 3]
-; CHECK-NEXT:    mov z3.h, p0/m, za1v.h[w12, 5]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za1v.h[w12, 5]
 ; CHECK-NEXT:    mov z0.h, p0/m, za1v.h[w12, 7]
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -130,21 +130,21 @@ define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
 define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 0]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 1]
-; CHECK-NEXT:    mov z3.h, p0/m, za0v.h[w12, 2]
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 2]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z3.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT:    mov z4.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 6]
 ; CHECK-NEXT:    mov z0.h, p0/m, za0v.h[w12, 7]
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -169,21 +169,21 @@ define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i
 define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, <vscale x 8 x bfloat> *%ptr) {
 ; CHECK-LABEL: extract_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    mov z4.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.h, p0/m, za0h.h[w12, 0]
 ; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 1]
-; CHECK-NEXT:    mov z3.h, p0/m, za0v.h[w12, 2]
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z4.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 2]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    mov z3.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT:    mov z4.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.h, p0/m, za0v.h[w12, 6]
 ; CHECK-NEXT:    mov z0.h, p0/m, za0v.h[w12, 7]
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -208,8 +208,8 @@ define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x
 define <vscale x 4 x i32> @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
 ; CHECK-NEXT:    mov z0.s, p0/m, za0h.s[w12, 2]
 ; CHECK-NEXT:    mov z0.d, z1.d
@@ -223,8 +223,8 @@ define <vscale x 4 x i32> @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i
 define <vscale x 4 x i32> @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_col_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.s, p0/m, za3v.s[w12, 1]
 ; CHECK-NEXT:    mov z0.s, p0/m, za3v.s[w12, 3]
 ; CHECK-NEXT:    mov z0.d, z1.d
@@ -239,13 +239,13 @@ define <vscale x 4 x i32> @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i
 define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
 ; CHECK-NEXT:    mov z2.s, p0/m, za0h.s[w12, 1]
-; CHECK-NEXT:    mov z3.s, p0/m, za0v.s[w12, 2]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z2.s, p0/m, za0v.s[w12, 2]
 ; CHECK-NEXT:    mov z0.s, p0/m, za0v.s[w12, 3]
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -283,8 +283,8 @@ define <vscale x 2 x i64> @extract_col_d(<vscale x 2 x i64> %zd, <vscale x 2 x i
 define <vscale x 2 x double> @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z1.d, p0/m, za0h.d[w12, 0]
 ; CHECK-NEXT:    mov z0.d, p0/m, za0v.d[w12, 1]
 ; CHECK-NEXT:    mov z0.d, z1.d
@@ -438,16 +438,16 @@ define <vscale x 2 x double> @extract_col_q_v2f64(<vscale x 2 x double> %zd, <vs
 define <vscale x 4 x i32> @test_sink_offset_operand(<vscale x 4 x i1> %pg, i32 %base, i32 %N) {
 ; CHECK-LABEL: test_sink_offset_operand:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:  .LBB26_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
-; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT:    subs w1, w1, #3
 ; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z1.s, p0/m, za0h.s[w12, 0]
 ; CHECK-NEXT:    mov z2.s, p0/m, za0h.s[w12, 1]
-; CHECK-NEXT:    subs w1, w1, #3
 ; CHECK-NEXT:    mov z3.s, p0/m, za0h.s[w12, 2]
 ; CHECK-NEXT:    b.ne .LBB26_1
 ; CHECK-NEXT:  // %bb.2: // %exit

diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll
index 394c34427cf28f..a88834c9626f86 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll
@@ -441,12 +441,12 @@ define void @insert_col_q_v2f64(<vscale x 2 x i1> %pg, <vscale x 2 x double> %zn
 define void @test_sink_offset_operand(<vscale x 4 x i1> %pg, i32 %base, i32 %N) {
 ; CHECK-LABEL: test_sink_offset_operand:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:  .LBB28_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subs w1, w1, #3
 ; CHECK-NEXT:    mov za0h.s[w12, 0], p0/m, z0.s
+; CHECK-NEXT:    subs w1, w1, #3
 ; CHECK-NEXT:    mov za0h.s[w12, 1], p0/m, z0.s
 ; CHECK-NEXT:    mov za0h.s[w12, 2], p0/m, z0.s
 ; CHECK-NEXT:    b.ne .LBB28_1

diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
index ddff4c7d3cd3e7..2bb9c3d05b9da5 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll
@@ -18,10 +18,10 @@ define void @st1b(<vscale x 16 x i1> %pg, ptr %ptr, i32 %sliceidx) {
 define void @st1b_with_addr_offset(<vscale x 16 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
 ; CHECK-LABEL: st1b_with_addr_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, wzr
-; CHECK-NEXT:    mov w13, w2
-; CHECK-NEXT:    st1b {za0h.b[w12, 0]}, p0, [x0, x1]
-; CHECK-NEXT:    st1b {za0v.b[w13, 15]}, p0, [x0, x1]
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    mov w12, w2
+; CHECK-NEXT:    st1b {za0h.b[w13, 0]}, p0, [x0, x1]
+; CHECK-NEXT:    st1b {za0v.b[w12, 15]}, p0, [x0, x1]
 ; CHECK-NEXT:    ret
   %base = getelementptr i8, ptr %ptr, i64 %index
   %tileslice = add i32 %sliceidx, 15
@@ -92,10 +92,10 @@ define void @st1w(<vscale x 4 x i1> %pg, ptr %ptr, i32 %sliceidx) {
 define void @st1w_with_addr_offset(<vscale x 4 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
 ; CHECK-LABEL: st1w_with_addr_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, wzr
-; CHECK-NEXT:    mov w13, w2
-; CHECK-NEXT:    st1w {za0h.s[w12, 0]}, p0, [x0, x1, lsl #2]
-; CHECK-NEXT:    st1w {za3v.s[w13, 3]}, p0, [x0, x1, lsl #2]
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    mov w12, w2
+; CHECK-NEXT:    st1w {za0h.s[w13, 0]}, p0, [x0, x1, lsl #2]
+; CHECK-NEXT:    st1w {za3v.s[w12, 3]}, p0, [x0, x1, lsl #2]
 ; CHECK-NEXT:    ret
   %base = getelementptr i32, ptr %ptr, i64 %index
   %tileslice = add i32 %sliceidx, 3

diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 00c1c9d66c3e65..538c403981b6d3 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -16,14 +16,14 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" {
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x9, x9, x8
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
 ; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    cbnz x8, .LBB0_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
@@ -55,8 +55,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" {
 ; CHECK-NEXT:    msr TPIDR2_EL0, x20
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    cbnz x8, .LBB1_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
@@ -66,8 +66,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" {
 ; CHECK-NEXT:    msr TPIDR2_EL0, x20
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    cbnz x8, .LBB1_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
@@ -94,14 +94,14 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_psta
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x9, x9, x8
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
 ; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    cbnz x8, .LBB2_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
@@ -131,10 +131,10 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x9, x9, x8
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #80
 ; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    sub x9, x29, #80
 ; CHECK-NEXT:    sturh w8, [x29, #-72]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz x19, #0, .LBB3_2
@@ -147,8 +147,8 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB3_4:
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    sub x0, x29, #80
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #80
 ; CHECK-NEXT:    cbnz x8, .LBB3_6
 ; CHECK-NEXT:  // %bb.5:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
@@ -156,10 +156,10 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    sub sp, x29, #64
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @private_za_callee()

diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 47d465cc320b5c..de7df1c9831908 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -15,14 +15,14 @@ define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind {
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x9, x9, x8
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
 ; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    cbnz x8, .LBB0_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
@@ -47,14 +47,14 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x9, x9, x8
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
 ; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    cbnz x8, .LBB1_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore

diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
index 7561e75d04efa0..ff23ed9dfe5a2d 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -18,9 +18,9 @@ define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body
 ; CHECK-NEXT:    bl streaming_compatible_callee
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 
@@ -101,10 +101,10 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
 
@@ -129,9 +129,9 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 
@@ -162,11 +162,11 @@ define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible"
@@ -191,11 +191,11 @@ define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct
 ; CHECK-NEXT:    stp q1, q0, [sp] // 32-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    ret
   %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1
@@ -251,10 +251,10 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr d0, [sp, #72] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
@@ -286,10 +286,10 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 214a5ce38f276b..9d694523e09cd7 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -147,17 +147,17 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB4_4:
+; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <2 x double> @normal_callee_vec_arg(<2 x double> %arg)
@@ -220,33 +220,33 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
 ; CHECK-NEXT:    fadd z0.d, z1.d, z0.d
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
@@ -312,33 +312,33 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
 ; CHECK-NEXT:    and p0.b, p1/z, p1.b, p0.b
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 2f4c907c872477..65521a0edc6d26 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -30,9 +30,9 @@ define void @normal_caller_streaming_callee() nounwind {
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @streaming_callee()
@@ -55,9 +55,9 @@ define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enable
 ; CHECK-NEXT:    bl normal_callee
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @normal_callee()
@@ -111,9 +111,9 @@ define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind {
 ; CHECK-NEXT:    blr x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void %p() "aarch64_pstate_sm_enabled"
@@ -135,11 +135,11 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   call void @streaming_callee()
@@ -188,33 +188,33 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -268,33 +268,33 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -319,14 +319,14 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
 ; CHECK-NEXT:    bl cos
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr d0, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldr d1, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    fadd d0, d1, d0
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
 entry:
@@ -351,9 +351,9 @@ define void @disable_tailcallopt() nounwind {
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   tail call void @streaming_callee()

diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
index f25e8b71bda1e4..ecaf8bccb71fb6 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add.ll
@@ -8,8 +8,8 @@
 define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,  <vscale x 4 x i32> %zm) {
 ; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
 ; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
@@ -27,8 +27,8 @@ define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4
 define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,  <vscale x 2 x i64> %zm) {
 ; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
 ; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
@@ -105,9 +105,9 @@ define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
 define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
 ; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
@@ -128,9 +128,9 @@ define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32>
 define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
 ; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
@@ -223,8 +223,8 @@ define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64>
 define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
 ; CHECK-LABEL: multi_vector_add_za_vg1x2_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }
 ; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }
@@ -238,8 +238,8 @@ define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0,
 define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
 ; CHECK-LABEL: multi_vector_add_za_vg1x2_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }
 ; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }
@@ -253,8 +253,8 @@ define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0,
 define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
 ; CHECK-LABEL: multi_vector_add_za_vg1x2_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fadd za.s[w8, 0, vgx2], { z0.s, z1.s }
 ; CHECK-NEXT:    fadd za.s[w8, 7, vgx2], { z0.s, z1.s }
@@ -270,8 +270,8 @@ define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0
 define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
 ; CHECK-LABEL: multi_vector_add_za_vg1x2_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fadd za.d[w8, 0, vgx2], { z0.d, z1.d }
 ; CHECK-NEXT:    fadd za.d[w8, 7, vgx2], { z0.d, z1.d }

diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll
index 016fb3c50cd9ed..401cdd0b9dfb79 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fmlas.ll
@@ -6,8 +6,8 @@
 define void @multi_vector_add_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
 ; CHECK-LABEL: multi_vector_add_single_vg1x2_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
 ; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
@@ -25,8 +25,8 @@ define void @multi_vector_add_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %z
 define void @multi_vector_add_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
 ; CHECK-LABEL: multi_vector_add_single_vg1x2_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
 ; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
@@ -94,8 +94,8 @@ define void @multi_vector_add_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %
 define void @multi_vector_sub_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
 ; CHECK-LABEL: multi_vector_sub_single_vg1x2_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
 ; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
@@ -113,8 +113,8 @@ define void @multi_vector_sub_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %z
 define void @multi_vector_sub_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
 ; CHECK-LABEL: multi_vector_sub_single_vg1x2_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
 ; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
@@ -182,9 +182,9 @@ define void @multi_vector_sub_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %
 define void @multi_vector_add_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
 ; CHECK-LABEL: multi_vector_add_vg1x2_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
@@ -204,9 +204,9 @@ define void @multi_vector_add_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vs
 define void @multi_vector_add_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
 ; CHECK-LABEL: multi_vector_add_vg1x2_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
@@ -227,9 +227,9 @@ define void @multi_vector_add_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <v
 define void @multi_vector_add_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
 ; CHECK-LABEL: multi_vector_add_vg1x2_s_regclass:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z6.s, z7.s }, { z4.s, z5.s }
@@ -320,9 +320,9 @@ define void @multi_vector_add_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float>
 define void @multi_vector_sub_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
 ; CHECK-LABEL: multi_vector_sub_vg1x2_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
@@ -342,9 +342,9 @@ define void @multi_vector_sub_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vs
 define void @multi_vector_sub_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
 ; CHECK-LABEL: multi_vector_sub_vg1x2_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
@@ -418,8 +418,8 @@ define void @multi_vector_sub_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <v
 define void @multi_vector_add_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
 ; CHECK-LABEL: multi_vector_add_lane_vg1x2_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
 ; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
@@ -437,8 +437,8 @@ define void @multi_vector_add_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0
 define void @multi_vector_add_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
 ; CHECK-LABEL: multi_vector_add_lane_vg1x2_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
 ; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
@@ -457,8 +457,8 @@ define void @multi_vector_add_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn
 define void @multi_vector_add_lane_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
 ; CHECK-LABEL: multi_vector_add_lane_vg1x2_s_regclass:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z0.d
 ; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z4.s, z5.s }, z2.s[3]
 ; CHECK-NEXT:    ret
@@ -540,8 +540,8 @@ define void @multi_vector_add_lane_vg1x4_s_regclass(i32 %slice, <vscale x 4 x fl
 define void @multi_vector_sub_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
 ; CHECK-LABEL: multi_vector_sub_lane_vg1x2_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
 ; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
@@ -559,8 +559,8 @@ define void @multi_vector_sub_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0
 define void @multi_vector_sub_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
 ; CHECK-LABEL: multi_vector_sub_lane_vg1x2_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
 ; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]

diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll
index 629c111b7e3225..0097968b1171d7 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll
@@ -9,9 +9,9 @@ target triple="aarch64-linux-gnu"
 define void @fdot_multi_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3) #0 {
 ; CHECK-LABEL: fdot_multi_za32_f16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    fdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -26,16 +26,16 @@ define void @fdot_multi_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @fdot_multi_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
 ; CHECK-LABEL: fdot_multi_za32_f16_vg1x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -54,9 +54,9 @@ define void @fdot_multi_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3) #0 {
 ; CHECK-LABEL: bfdot_multi_za32_bf16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    bfdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -71,16 +71,16 @@ define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused,
 define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
 ; CHECK-LABEL: fdot_multi_za32_bf16_vg1x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    bfdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    bfdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -99,8 +99,8 @@ define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
 define void @fdot_single_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) #0 {
 ; CHECK-LABEL: fdot_single_za32_f16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    fdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    fdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
@@ -134,8 +134,8 @@ define void @fdot_single_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
 define void @bfdot_single_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) #0 {
 ; CHECK-LABEL: bfdot_single_za32_bf16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    bfdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    bfdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
@@ -169,8 +169,8 @@ define void @bfdot_single_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused
 define void @fdot_lane_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) #0 {
 ; CHECK-LABEL: fdot_lane_za32_f16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    fdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
 ; CHECK-NEXT:    fdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
@@ -206,8 +206,8 @@ define void @fdot_lane_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
 define void @bfdot_lane_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) #0 {
 ; CHECK-LABEL: bfdot_lane_za32_bf16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    bfdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
 ; CHECK-NEXT:    bfdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]

diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll
index 0de05c0b08a696..2712cbc1d7efd3 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll
@@ -10,8 +10,8 @@
 define void @za_write_vg2_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
 ; CHECK-LABEL: za_write_vg2_horiz_b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0h.b[w12, 0:1], { z0.b, z1.b }
 ; CHECK-NEXT:    mov za0h.b[w12, 14:15], { z0.b, z1.b }
@@ -25,8 +25,8 @@ define void @za_write_vg2_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x
 define void @za_write_vg2_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
 ; CHECK-LABEL: za_write_vg2_horiz_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0h.h[w12, 0:1], { z0.h, z1.h }
 ; CHECK-NEXT:    mov za1h.h[w12, 6:7], { z0.h, z1.h }
@@ -40,8 +40,8 @@ define void @za_write_vg2_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x
 define void @za_write_vg2_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
 ; CHECK-LABEL: za_write_vg2_horiz_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0h.h[w12, 0:1], { z0.h, z1.h }
 ; CHECK-NEXT:    mov za1h.h[w12, 6:7], { z0.h, z1.h }
@@ -55,8 +55,8 @@ define void @za_write_vg2_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscal
 define void @za_write_vg2_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
 ; CHECK-LABEL: za_write_vg2_horiz_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0h.h[w12, 0:1], { z0.h, z1.h }
 ; CHECK-NEXT:    mov za1h.h[w12, 6:7], { z0.h, z1.h }
@@ -70,8 +70,8 @@ define void @za_write_vg2_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vs
 define void @za_write_vg2_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
 ; CHECK-LABEL: za_write_vg2_horiz_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0h.s[w12, 0:1], { z0.s, z1.s }
 ; CHECK-NEXT:    mov za3h.s[w12, 2:3], { z0.s, z1.s }
@@ -85,8 +85,8 @@ define void @za_write_vg2_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x
 define void @za_write_vg2_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
 ; CHECK-LABEL: za_write_vg2_horiz_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0h.s[w12, 0:1], { z0.s, z1.s }
 ; CHECK-NEXT:    mov za3h.s[w12, 2:3], { z0.s, z1.s }
@@ -100,8 +100,8 @@ define void @za_write_vg2_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vsca
 define void @za_write_vg2_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
 ; CHECK-LABEL: za_write_vg2_horiz_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0h.d[w12, 0:1], { z0.d, z1.d }
 ; CHECK-NEXT:    ret
@@ -112,8 +112,8 @@ define void @za_write_vg2_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x
 define void @za_write_vg2_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
 ; CHECK-LABEL: za_write_vg2_horiz_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0h.d[w12, 0:1], { z0.d, z1.d }
 ; CHECK-NEXT:    ret
@@ -126,8 +126,8 @@ define void @za_write_vg2_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vsc
 define void @za_write_vg2_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
 ; CHECK-LABEL: za_write_vg2_vert_b:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0v.b[w12, 0:1], { z0.b, z1.b }
 ; CHECK-NEXT:    mov za0v.b[w12, 14:15], { z0.b, z1.b }
@@ -141,8 +141,8 @@ define void @za_write_vg2_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x
 define void @za_write_vg2_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
 ; CHECK-LABEL: za_write_vg2_vert_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0v.h[w12, 0:1], { z0.h, z1.h }
 ; CHECK-NEXT:    mov za1v.h[w12, 6:7], { z0.h, z1.h }
@@ -156,8 +156,8 @@ define void @za_write_vg2_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x
 define void @za_write_vg2_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
 ; CHECK-LABEL: za_write_vg2_vert_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0v.h[w12, 0:1], { z0.h, z1.h }
 ; CHECK-NEXT:    mov za1v.h[w12, 6:7], { z0.h, z1.h }
@@ -171,8 +171,8 @@ define void @za_write_vg2_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale
 define void @za_write_vg2_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
 ; CHECK-LABEL: za_write_vg2_vert_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0v.h[w12, 0:1], { z0.h, z1.h }
 ; CHECK-NEXT:    mov za1v.h[w12, 6:7], { z0.h, z1.h }
@@ -186,8 +186,8 @@ define void @za_write_vg2_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vsc
 define void @za_write_vg2_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
 ; CHECK-LABEL: za_write_vg2_vert_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0v.s[w12, 0:1], { z0.s, z1.s }
 ; CHECK-NEXT:    mov za3v.s[w12, 2:3], { z0.s, z1.s }
@@ -201,8 +201,8 @@ define void @za_write_vg2_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x
 define void @za_write_vg2_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
 ; CHECK-LABEL: za_write_vg2_vert_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0v.s[w12, 0:1], { z0.s, z1.s }
 ; CHECK-NEXT:    mov za3v.s[w12, 2:3], { z0.s, z1.s }
@@ -216,8 +216,8 @@ define void @za_write_vg2_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscal
 define void @za_write_vg2_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
 ; CHECK-LABEL: za_write_vg2_vert_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0v.d[w12, 0:1], { z0.d, z1.d }
 ; CHECK-NEXT:    ret
@@ -228,8 +228,8 @@ define void @za_write_vg2_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x
 define void @za_write_vg2_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
 ; CHECK-LABEL: za_write_vg2_vert_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w12, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za0v.d[w12, 0:1], { z0.d, z1.d }
 ; CHECK-NEXT:    ret
@@ -500,8 +500,8 @@ define void @za_write_vg4_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vsca
 define void @za_write_vg1x2_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2) {
 ; CHECK-LABEL: za_write_vg1x2_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
 ; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
@@ -515,8 +515,8 @@ define void @za_write_vg1x2_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x
 define void @za_write_vg1x2_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2) {
 ; CHECK-LABEL: za_write_vg1x2_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
 ; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }

diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index dac8096de5234c..6d986048371157 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -9,9 +9,9 @@ target triple="aarch64-linux-gnu"
 define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 {
 ; CHECK-LABEL: udot_multi_za32_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -26,16 +26,16 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
 ; CHECK-LABEL: udot_multi_za32_u16_vg1x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -51,9 +51,9 @@ define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
 ; CHECK-LABEL: udot_multi_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
@@ -68,16 +68,16 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
 define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
 ; CHECK-LABEL: udot_multi_za32_u8_vg1x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
 ; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    ret
@@ -93,9 +93,9 @@ define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
 define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 {
 ; CHECK-LABEL: udot_multi_za64_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    udot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -110,16 +110,16 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
 ; CHECK-LABEL: udot_multi_za64_u16_vg1x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -135,9 +135,9 @@ define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
 ; CHECK-LABEL: usdot_multi_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
@@ -152,16 +152,16 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
 ; CHECK-LABEL: usdot_multi_za32_u8_vg1x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
 ; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    ret
@@ -180,9 +180,9 @@ define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 {
 ; CHECK-LABEL: sdot_multi_za32_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -197,16 +197,16 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
 ; CHECK-LABEL: sdot_multi_za32_u16_vg1x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -222,9 +222,9 @@ define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
 ; CHECK-LABEL: sdot_multi_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
@@ -239,16 +239,16 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <v
 define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
 ; CHECK-LABEL: sdot_multi_za32_u8_vg1x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
 ; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    ret
@@ -264,9 +264,9 @@ define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
 define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 {
 ; CHECK-LABEL: sdot_multi_za64_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sdot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -281,16 +281,16 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
 ; CHECK-LABEL: sdot_multi_za64_u16_vg1x4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -309,8 +309,8 @@ define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
 ; CHECK-LABEL: udot_single_za32_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
@@ -341,8 +341,8 @@ define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
 define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
 ; CHECK-LABEL: udot_single_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
@@ -373,8 +373,8 @@ define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
 ; CHECK-LABEL: udot_single_za64_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
@@ -405,8 +405,8 @@ define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
 define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
 ; CHECK-LABEL: usdot_single_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
@@ -440,8 +440,8 @@ define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
 define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
 ; CHECK-LABEL: sdot_single_za32_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
@@ -472,8 +472,8 @@ define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
 define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
 ; CHECK-LABEL: sdot_single_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
@@ -504,8 +504,8 @@ define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <
 define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
 ; CHECK-LABEL: sdot_single_za64_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
@@ -536,8 +536,8 @@ define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
 define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
 ; CHECK-LABEL: sudot_single_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
@@ -570,8 +570,8 @@ define void @sudot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused,
 define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
 ; CHECK-LABEL: udot_lane_za32_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
 ; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
@@ -604,8 +604,8 @@ define void @udot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 8 x i16> %zn0, <vsca
 define void @udot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
 ; CHECK-LABEL: udot_lane_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
 ; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
@@ -638,8 +638,8 @@ define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
 define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
 ; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
 ; CHECK-NEXT:    udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
@@ -672,8 +672,8 @@ define void @udot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
 define void @usdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
 ; CHECK-LABEL: usdot_lane_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
 ; CHECK-NEXT:    usdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
@@ -709,8 +709,8 @@ define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
 define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
 ; CHECK-LABEL: sdot_lane_za32_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
 ; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
@@ -743,8 +743,8 @@ define void @sdot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
 define void @sdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
 ; CHECK-LABEL: sdot_lane_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
 ; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
@@ -777,8 +777,8 @@ define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vs
 define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
 ; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
 ; CHECK-NEXT:    sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
@@ -813,8 +813,8 @@ define void @sdot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <v
 define void @sudot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
 ; CHECK-LABEL: sudot_lane_za32_u8_vg1x2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
 ; CHECK-NEXT:    sudot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]

diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index 406d7f78097177..b7119fc0825673 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -55,6 +55,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -71,7 +72,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -82,8 +82,8 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -210,6 +210,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -226,7 +227,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0, x1]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -237,8 +237,8 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -366,6 +366,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -382,7 +383,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1h { z0.h, z8.h }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -393,8 +393,8 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -521,6 +521,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -537,7 +538,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -548,8 +548,8 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -677,6 +677,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -693,7 +694,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1w { z0.s, z8.s }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -704,8 +704,8 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -832,6 +832,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -848,7 +849,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -859,8 +859,8 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -988,6 +988,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1004,7 +1005,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1d { z0.d, z8.d }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1015,8 +1015,8 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1143,6 +1143,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1159,7 +1160,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1170,8 +1170,8 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1301,6 +1301,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1317,7 +1318,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1328,9 +1328,9 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1466,6 +1466,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1482,7 +1483,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1493,9 +1493,9 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1632,6 +1632,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1648,7 +1649,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1659,9 +1659,9 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1798,6 +1798,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1814,7 +1815,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1825,9 +1825,9 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1964,6 +1964,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1980,7 +1981,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1991,9 +1991,9 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -2130,6 +2130,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -2146,7 +2147,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -2157,9 +2157,9 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -2296,6 +2296,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -2312,7 +2313,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -2323,9 +2323,9 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -2462,6 +2462,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -2478,7 +2479,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -2489,9 +2489,9 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload

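Each ld1 hunk above makes the same two moves: the predicate copy
(mov p8.b, p0.b) is hoisted above the block of z-register spills, and the
result copies (mov z1.d, z8.d, or mov z2.d and z3.d in the x4 cases) swap
places with one reload. These checks are autogenerated, so the mechanical
way to refresh them after a scheduling-model change is to rerun the update
script on the affected tests (a sketch, assuming a checked-out
llvm-project with llc on PATH, or pass --llc-binary):

    $ llvm/utils/update_llc_test_checks.py \
          llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll \
          llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
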
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index aca99138d2cdca..1fb251a4f628e9 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -8,6 +8,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -24,7 +25,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1b { z0.b, z8.b }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -35,8 +35,8 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -116,6 +116,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -132,7 +133,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -143,8 +143,8 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -225,6 +225,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -241,7 +242,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1h { z0.h, z8.h }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -252,8 +252,8 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -333,6 +333,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -349,7 +350,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -360,8 +360,8 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -442,6 +442,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -458,7 +459,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1w { z0.s, z8.s }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -469,8 +469,8 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -550,6 +550,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -566,7 +567,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -577,8 +577,8 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -659,6 +659,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -675,7 +676,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1d { z0.d, z8.d }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -686,8 +686,8 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -767,6 +767,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -783,7 +784,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -794,8 +794,8 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
-; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    mov z1.d, z8.d
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -876,6 +876,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -892,7 +893,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -903,9 +903,9 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -992,6 +992,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1008,7 +1009,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1019,9 +1019,9 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1109,6 +1109,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1125,7 +1126,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1136,9 +1136,9 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1225,6 +1225,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1241,7 +1242,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1252,9 +1252,9 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1342,6 +1342,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1358,7 +1359,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1369,9 +1369,9 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1458,6 +1458,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1474,7 +1475,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1485,9 +1485,9 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1575,6 +1575,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1591,7 +1592,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1602,9 +1602,9 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
@@ -1691,6 +1691,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
 ; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; STRIDED-NEXT:    addvl sp, sp, #-17
 ; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
@@ -1707,7 +1708,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
 ; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
 ; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
-; STRIDED-NEXT:    mov p8.b, p0.b
 ; STRIDED-NEXT:    ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
 ; STRIDED-NEXT:    //APP
 ; STRIDED-NEXT:    nop
@@ -1718,9 +1718,9 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
 ; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    mov z2.d, z8.d
 ; STRIDED-NEXT:    mov z3.d, z12.d
-; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
 ; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload

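Churn of this shape is expected whenever the scheduler's view of latencies
changes, because update_llc_test_checks.py pins the exact instruction
order with CHECK-NEXT lines. A minimal illustration of the format (not
from this patch):

    ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
    ; RUN: llc -mtriple=aarch64 < %s | FileCheck %s

    define i32 @add(i32 %a, i32 %b) {
    ; CHECK-LABEL: add:
    ; CHECK:       // %bb.0:
    ; CHECK-NEXT:    add w0, w0, w1
    ; CHECK-NEXT:    ret
      %r = add i32 %a, %b
      ret i32 %r
    }

Reordering two adjacent CHECK-NEXT instructions, even when semantically
neutral, fails FileCheck, which is why a scheduling refresh produces diffs
like the ones in this commit.
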
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll
index f0671cb1f012c5..e95d29f65e55e7 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll
@@ -542,15 +542,15 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
                            <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    smax { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -568,15 +568,15 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
                             <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    smax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -594,15 +594,15 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
                             <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_s32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    smax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -620,15 +620,15 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
                             <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_s64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    smax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -648,15 +648,15 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
                            <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    umax { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -674,15 +674,15 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
                             <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    umax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -700,15 +700,15 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
                             <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_u32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    umax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -726,15 +726,15 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
                             <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_u64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    umax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -754,15 +754,15 @@ define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale
                             <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3, <vscale x 8 x half> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    fmax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -780,15 +780,15 @@ define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vsca
                             <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    fmax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -806,15 +806,15 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
                             <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
 ; CHECK-LABEL: multi_vec_max_multi_x4_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    fmax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -980,15 +980,15 @@ define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale
 @multi_vec_maxnm_x4_f16(<vscale x 8 x half> %dummy, <vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zdn3, <vscale x 8 x half> %zdn4, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3, <vscale x 8 x half> %zm4) {
 ; CHECK-LABEL: multi_vec_maxnm_x4_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    fmaxnm { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -1005,15 +1005,15 @@ define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vsca
 @multi_vec_maxnm_x4_f32(<vscale x 8 x half> %dummy, <vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zdn3, <vscale x 4 x float> %zdn4, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
 ; CHECK-LABEL: multi_vec_maxnm_x4_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    fmaxnm { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -1030,15 +1030,15 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
 @multi_vec_maxnm_x4_f64(<vscale x 8 x half> %dummy, <vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zdn3, <vscale x 2 x double> %zdn4, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
 ; CHECK-LABEL: multi_vec_maxnm_x4_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    fmaxnm { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d

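The hunks above do not change which instructions are selected; only their issue order moves, which is the expected fallout of a scheduling-model refresh. These CHECK lines are machine-generated, so churn like this is regenerated with utils/update_llc_test_checks.py rather than hand-edited; a sketch of the test-file header involved (the exact RUN line of this file is assumed, not quoted):

    ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
    ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s \
    ; RUN:   | FileCheck %s
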
diff  --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll
index 45b46a9b905665..21a55c6436acd8 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll
@@ -542,15 +542,15 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
                            <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
 ; CHECK-NEXT:    smin { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -568,15 +568,15 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
                             <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
 ; CHECK-NEXT:    smin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -594,15 +594,15 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
                             <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_s32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
 ; CHECK-NEXT:    smin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -620,15 +620,15 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
                             <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_s64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
 ; CHECK-NEXT:    smin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -648,15 +648,15 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
                            <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
 ; CHECK-NEXT:    umin { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -674,15 +674,15 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
                             <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
 ; CHECK-NEXT:    umin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -700,15 +700,15 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
                             <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_u32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
 ; CHECK-NEXT:    umin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -726,15 +726,15 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
                             <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_u64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
 ; CHECK-NEXT:    umin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -754,15 +754,15 @@ define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale
                             <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3, <vscale x 8 x half> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
 ; CHECK-NEXT:    fmin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -780,15 +780,15 @@ define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vsca
                             <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
 ; CHECK-NEXT:    fmin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -806,15 +806,15 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
                             <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
 ; CHECK-LABEL: multi_vec_min_multi_x4_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
 ; CHECK-NEXT:    fmin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -980,15 +980,15 @@ define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale
 @multi_vec_minnm_x4_f16(<vscale x 8 x half> %dummy, <vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zdn3, <vscale x 8 x half> %zdn4, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3, <vscale x 8 x half> %zm4) {
 ; CHECK-LABEL: multi_vec_minnm_x4_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
 ; CHECK-NEXT:    fminnm { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -1005,15 +1005,15 @@ define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vsca
 @multi_vec_minnm_x4_f32(<vscale x 8 x half> %dummy, <vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zdn3, <vscale x 4 x float> %zdn4, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
 ; CHECK-LABEL: multi_vec_minnm_x4_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
 ; CHECK-NEXT:    fminnm { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -1030,15 +1030,15 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
 @multi_vec_minnm_x4_f64(<vscale x 8 x half> %dummy, <vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zdn3, <vscale x 2 x double> %zdn4, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
 ; CHECK-LABEL: multi_vec_minnm_x4_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z29.d, z6.d
 ; CHECK-NEXT:    mov z26.d, z3.d
-; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
+; CHECK-NEXT:    mov z28.d, z5.d
 ; CHECK-NEXT:    mov z25.d, z2.d
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
 ; CHECK-NEXT:    fminnm { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d

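FileCheck's CHECK-NEXT directives pin exact line-by-line order, which is why a semantically neutral reordering of independent mov instructions surfaces as test churn at all. A minimal, hypothetical test (not from this commit) showing the mechanism:

    ; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
    define i32 @sum(i32 %a, i32 %b) {
    ; CHECK-LABEL: sum:
    ; CHECK:       add w0, w0, w1
    ; CHECK-NEXT:  ret
      %r = add i32 %a, %b
      ret i32 %r
    }
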
diff  --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll
index 9db2254df8e369..f766bfcff4d1d1 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll
@@ -38,8 +38,8 @@ define void @multi_vector_mul_add_single_long_vg4x1_s16(i32 %slice, <vscale x 8
 define void @multi_vector_mul_add_single_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
@@ -53,8 +53,8 @@ define void @multi_vector_mul_add_single_long_vg4x2_s8(i32 %slice, <vscale x 16
 define void @multi_vector_mul_add_single_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
@@ -106,9 +106,9 @@ define void @multi_vector_mul_add_single_long_vg4x4_s16(i32 %slice, <vscale x 8
 define void @multi_vector_mul_add_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
@@ -123,9 +123,9 @@ define void @multi_vector_mul_add_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -142,16 +142,16 @@ define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x
 define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
 ; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    ret
@@ -164,16 +164,16 @@ define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_add_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -216,8 +216,8 @@ define void @multi_vector_mul_add_lane_long_vg4x1_s16(i32 %slice, <vscale x 8 x
 define void @multi_vector_mul_add_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
 ; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
@@ -231,8 +231,8 @@ define void @multi_vector_mul_add_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_add_lane_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
 ; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
@@ -314,8 +314,8 @@ define void @multi_vector_mul_add_single_long_vg4x1_u16(i32 %slice, <vscale x 8
 define void @multi_vector_mul_add_single_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
@@ -329,8 +329,8 @@ define void @multi_vector_mul_add_single_long_vg4x2_u8(i32 %slice, <vscale x 16
 define void @multi_vector_mul_add_single_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
@@ -382,9 +382,9 @@ define void @multi_vector_mul_add_single_long_vg4x4_u16(i32 %slice, <vscale x 8
 define void @multi_vector_mul_add_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
@@ -399,9 +399,9 @@ define void @multi_vector_mul_add_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -418,16 +418,16 @@ define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x
 define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
 ; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    ret
@@ -440,16 +440,16 @@ define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_add_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -492,8 +492,8 @@ define void @multi_vector_mul_add_lane_long_vg4x1_u16(i32 %slice, <vscale x 8 x
 define void @multi_vector_mul_add_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
 ; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
@@ -507,8 +507,8 @@ define void @multi_vector_mul_add_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_add_lane_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
 ; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
@@ -590,8 +590,8 @@ define void @multi_vector_mul_sub_single_long_vg4x1_s16(i32 %slice, <vscale x 8
 define void @multi_vector_mul_sub_single_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
@@ -605,8 +605,8 @@ define void @multi_vector_mul_sub_single_long_vg4x2_s8(i32 %slice, <vscale x 16
 define void @multi_vector_mul_sub_single_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
@@ -658,9 +658,9 @@ define void @multi_vector_mul_sub_single_long_vg4x4_s16(i32 %slice, <vscale x 8
 define void @multi_vector_mul_sub_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
@@ -675,9 +675,9 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -694,16 +694,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x
 define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
 ; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    ret
@@ -716,16 +716,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_sub_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -768,8 +768,8 @@ define void @multi_vector_mul_sub_lane_long_vg4x1_s16(i32 %slice, <vscale x 8 x
 define void @multi_vector_mul_sub_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
 ; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
@@ -783,8 +783,8 @@ define void @multi_vector_mul_sub_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_sub_lane_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
 ; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
@@ -866,8 +866,8 @@ define void @multi_vector_mul_sub_single_long_vg4x1_u16(i32 %slice, <vscale x 8
 define void @multi_vector_mul_sub_single_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
@@ -881,8 +881,8 @@ define void @multi_vector_mul_sub_single_long_vg4x2_u8(i32 %slice, <vscale x 16
 define void @multi_vector_mul_sub_single_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
 ; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
@@ -934,9 +934,9 @@ define void @multi_vector_mul_sub_single_long_vg4x4_u16(i32 %slice, <vscale x 8
 define void @multi_vector_mul_sub_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
@@ -951,9 +951,9 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
@@ -970,16 +970,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x
 define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
 ; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    ret
@@ -992,16 +992,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_sub_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
 ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
 ; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ret
@@ -1044,8 +1044,8 @@ define void @multi_vector_mul_sub_lane_long_vg4x1_u16(i32 %slice, <vscale x 8 x
 define void @multi_vector_mul_sub_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
 ; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
@@ -1059,8 +1059,8 @@ define void @multi_vector_mul_sub_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x
 define void @multi_vector_mul_sub_lane_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
 ; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
@@ -1116,8 +1116,8 @@ define void @multi_vector_mul_sub_lane_long_vg4x4_u16(i32 %slice, <vscale x 8 x
 define void @multi_vector_mul_add_single_signed_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_single_signed_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
@@ -1167,8 +1167,8 @@ define void @multi_vector_mul_add_lane_signed_long_vg4x1_s8(i32 %slice, <vscale
 define void @multi_vector_mul_add_lane_signed_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
 ; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
@@ -1220,8 +1220,8 @@ define void @multi_vector_mul_add_single_unsigned_long_vg4x1_s8(i32 %slice, <vsc
 define void @multi_vector_mul_add_single_unsigned_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
 ; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
 ; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
@@ -1256,9 +1256,9 @@ define void @multi_vector_mul_add_single_unsigned_long_vg4x4_s8(i32 %slice, <vsc
 define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x2_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z4.d
 ; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
@@ -1275,16 +1275,16 @@ define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, <vsca
 define void @multi_vector_mul_add_multi_unsigned_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
 ; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x4_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z25.d, z6.d
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    mov z24.d, z5.d
-; CHECK-NEXT:    mov z30.d, z3.d
-; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
 ; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    ret
@@ -1314,8 +1314,8 @@ define void @multi_vector_mul_add_lane_unsigned_long_vg4x1_s8(i32 %slice, <vscal
 define void @multi_vector_mul_add_lane_unsigned_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x2_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
 ; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]

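The long runs of mov zN.d, zM.d in these hunks are marshalling copies: the vgx2/vgx4 forms of smlall/smlsll/umlall/umlsll consume consecutive register groups such as { z28.b - z31.b }, while the AAPCS hands the arguments over in z1..z7, so the compiler copies them into contiguous tuples first. The new model merely lets the scheduler interleave the w8 slice-index mov with those copies. A sketch of the pattern (zm marshalling elided; registers as in the output above):

    // marshal AAPCS vector arguments into a consecutive quad for vgx4
    mov z28.d, z1.d      // zn0 -> first register of { z28 - z31 }
    mov z29.d, z2.d
    mov z30.d, z3.d
    mov z31.d, z4.d
    mov w8, w0           // ZA slice index
    smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
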
diff  --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll
index 95527daa24befb..ba10c2dd3cf485 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll
@@ -120,8 +120,8 @@ define void @multi_vector_sub_single_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %z
 define void @multi_vector_add_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
 ; CHECK-LABEL: multi_vector_add_single_vg2x2_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
 ; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
@@ -135,8 +135,8 @@ define void @multi_vector_add_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat
 define void @multi_vector_add_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
 ; CHECK-LABEL: multi_vector_add_single_vg2x2_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
 ; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
@@ -150,8 +150,8 @@ define void @multi_vector_add_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %
 define void @multi_vector_add_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_add_single_vg2x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
 ; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
@@ -165,8 +165,8 @@ define void @multi_vector_add_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %z
 define void @multi_vector_add_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_add_single_vg2x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
 ; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
@@ -184,8 +184,8 @@ define void @multi_vector_add_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %z
 define void @multi_vector_sub_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
 ; CHECK-LABEL: multi_vector_sub_single_vg2x2_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
 ; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
@@ -199,8 +199,8 @@ define void @multi_vector_sub_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat
 define void @multi_vector_sub_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
 ; CHECK-LABEL: multi_vector_sub_single_vg2x2_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
 ; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
@@ -214,8 +214,8 @@ define void @multi_vector_sub_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %
 define void @multi_vector_sub_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_sub_single_vg2x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
 ; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
@@ -229,8 +229,8 @@ define void @multi_vector_sub_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %z
 define void @multi_vector_sub_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_sub_single_vg2x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
 ; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
@@ -424,9 +424,9 @@ define void @multi_vector_sub_single_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %z
 define void @multi_vector_add_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0,  <vscale x 8 x bfloat> %zm1) {
 ; CHECK-LABEL: multi_vector_add_multi_vg2x2_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
@@ -443,9 +443,9 @@ define void @multi_vector_add_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat>
 define void @multi_vector_add_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
 ; CHECK-LABEL: multi_vector_add_multi_vg2x2_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
@@ -462,9 +462,9 @@ define void @multi_vector_add_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %z
 define void @multi_vector_add_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
 ; CHECK-LABEL: multi_vector_add_multi_vg2x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
@@ -481,9 +481,9 @@ define void @multi_vector_add_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn
 define void @multi_vector_add_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
 ; CHECK-LABEL: multi_vector_add_multi_vg2x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
@@ -504,9 +504,9 @@ define void @multi_vector_add_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn
 define void @multi_vector_sub_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1) {
 ; CHECK-LABEL: multi_vector_sub_multi_vg2x2_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
@@ -523,9 +523,9 @@ define void @multi_vector_sub_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat>
 define void @multi_vector_sub_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
 ; CHECK-LABEL: multi_vector_sub_multi_vg2x2_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
@@ -542,9 +542,9 @@ define void @multi_vector_sub_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %z
 define void @multi_vector_sub_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
 ; CHECK-LABEL: multi_vector_sub_multi_vg2x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
@@ -561,9 +561,9 @@ define void @multi_vector_sub_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn
 define void @multi_vector_sub_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
 ; CHECK-LABEL: multi_vector_sub_multi_vg2x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
@@ -912,8 +912,8 @@ define void @multi_vector_sub_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn,
 define void @multi_vector_add_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
 ; CHECK-LABEL: multi_vector_add_lane_vg2x2_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
 ; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
@@ -929,8 +929,8 @@ define void @multi_vector_add_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn
 define void @multi_vector_add_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
 ; CHECK-LABEL: multi_vector_add_lane_vg2x2_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
 ; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
@@ -946,8 +946,8 @@ define void @multi_vector_add_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat>
 define void @multi_vector_add_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_add_lane_vg2x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
 ; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
@@ -963,8 +963,8 @@ define void @multi_vector_add_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0
 define void @multi_vector_add_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_add_lane_vg2x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
 ; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
@@ -984,8 +984,8 @@ define void @multi_vector_add_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0
 define void @multi_vector_sub_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
 ; CHECK-LABEL: multi_vector_sub_lane_vg2x2_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
 ; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
@@ -1001,8 +1001,8 @@ define void @multi_vector_sub_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn
 define void @multi_vector_sub_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
 ; CHECK-LABEL: multi_vector_sub_lane_vg2x2_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
 ; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
@@ -1018,8 +1018,8 @@ define void @multi_vector_sub_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat>
 define void @multi_vector_sub_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_sub_lane_vg2x2_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
 ; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
@@ -1035,8 +1035,8 @@ define void @multi_vector_sub_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0
 define void @multi_vector_sub_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: multi_vector_sub_lane_vg2x2_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
 ; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]

diff  --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll
index a5dd7ca0e89490..d138a3af438524 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll
@@ -332,15 +332,15 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
 @multi_vec_rounding_shl_x4_s8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
 ; CHECK-LABEL: multi_vec_rounding_shl_x4_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    srshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -357,15 +357,15 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
 @multi_vec_rounding_shl_x4_s16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
 ; CHECK-LABEL: multi_vec_rounding_shl_x4_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    srshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -382,15 +382,15 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
 @multi_vec_rounding_shl_x4_s32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
 ; CHECK-LABEL: multi_vec_rounding_shl_x4_s32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    srshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -407,15 +407,15 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
 @multi_vec_rounding_shl_x4_s64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
 ; CHECK-LABEL: multi_vec_rounding_shl_x4_s64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    srshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -496,15 +496,15 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
 @multi_vec_rounding_shl_x4_u8(<vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
 ; CHECK-LABEL: multi_vec_rounding_shl_x4_u8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    urshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -521,15 +521,15 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
 @multi_vec_rounding_shl_x4_u16(<vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
 ; CHECK-LABEL: multi_vec_rounding_shl_x4_u16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    urshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -546,15 +546,15 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
 @multi_vec_rounding_shl_x4_u32(<vscale x 4 x i32> %dummy, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
 ; CHECK-LABEL: multi_vec_rounding_shl_x4_u32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    urshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -571,15 +571,15 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
 @multi_vec_rounding_shl_x4_u64(<vscale x 2 x i64> %dummy, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
 ; CHECK-LABEL: multi_vec_rounding_shl_x4_u64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    urshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d

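These srshl/urshl CHECK blocks are machine-generated, which is why a scheduling-model change rewrites them wholesale: rerunning utils/update_llc_test_checks.py refreshes every block in one pass. A minimal sketch of such an autogenerated test (the RUN line and function are illustrative, not taken from this patch):

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
define <vscale x 4 x i32> @add_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: add_i32:
; CHECK: add z0.s, z0.s, z1.s
  %r = add <vscale x 4 x i32> %a, %b
  ret <vscale x 4 x i32> %r
}
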
diff  --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll
index d55951faf5f0a4..9c5dff6c3bf6fb 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll
@@ -201,15 +201,15 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
                                        <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3, <vscale x 16 x i8> %zm4) {
 ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1b { z31.b }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    sqdmulh { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -227,15 +227,15 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
                                         <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3, <vscale x 8 x i16> %zm4) {
 ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    sqdmulh { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -253,15 +253,15 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
                                         <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3, <vscale x 4 x i32> %zm4) {
 ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    sqdmulh { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d
@@ -279,15 +279,15 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
                                         <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3, <vscale x 2 x i64> %zm4) {
 ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z30.d, z7.d
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z29.d, z6.d
-; CHECK-NEXT:    mov z27.d, z4.d
-; CHECK-NEXT:    mov z28.d, z5.d
-; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z30.d, z7.d
+; CHECK-NEXT:    mov z27.d, z4.d
+; CHECK-NEXT:    mov z29.d, z6.d
+; CHECK-NEXT:    mov z26.d, z3.d
+; CHECK-NEXT:    mov z28.d, z5.d
+; CHECK-NEXT:    mov z25.d, z2.d
+; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z25.d, z2.d
-; CHECK-NEXT:    mov z24.d, z1.d
 ; CHECK-NEXT:    sqdmulh { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
 ; CHECK-NEXT:    mov z0.d, z24.d
 ; CHECK-NEXT:    mov z1.d, z25.d

diff  --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll
index 1ac6d2dd9855f8..da8c679d5a39a8 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sub.ll
@@ -8,8 +8,8 @@
 define void @multi_vector_sub_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,  <vscale x 4 x i32> %zm) {
 ; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x2_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    sub za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
 ; CHECK-NEXT:    sub za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
@@ -27,8 +27,8 @@ define void @multi_vector_sub_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4
 define void @multi_vector_sub_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,  <vscale x 2 x i64> %zm) {
 ; CHECK-LABEL: multi_vector_sub_write_single_za_vg1x2_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    sub za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
 ; CHECK-NEXT:    sub za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
@@ -105,9 +105,9 @@ define void @multi_vector_sub_write_single_za_vg1x4_i64(i32 %slice,
 define void @multi_vector_sub_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
 ; CHECK-LABEL: multi_vector_sub_write_za_vg1x2_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    sub za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
@@ -128,9 +128,9 @@ define void @multi_vector_sub_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32>
 define void @multi_vector_sub_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
 ; CHECK-LABEL: multi_vector_sub_write_za_vg1x2_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    sub za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
@@ -225,8 +225,8 @@ define void @multi_vector_sub_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64>
 define void @multi_vector_sub_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
 ; CHECK-LABEL: multi_vector_sub_za_vg1x2_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    sub za.s[w8, 0, vgx2], { z0.s, z1.s }
 ; CHECK-NEXT:    sub za.s[w8, 7, vgx2], { z0.s, z1.s }
@@ -240,8 +240,8 @@ define void @multi_vector_sub_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0,
 define void @multi_vector_sub_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
 ; CHECK-LABEL: multi_vector_sub_za_vg1x2_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    sub za.d[w8, 0, vgx2], { z0.d, z1.d }
 ; CHECK-NEXT:    sub za.d[w8, 7, vgx2], { z0.d, z1.d }
@@ -255,8 +255,8 @@ define void @multi_vector_sub_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0,
 define void @multi_vector_sub_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
 ; CHECK-LABEL: multi_vector_sub_za_vg1x2_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fsub za.s[w8, 0, vgx2], { z0.s, z1.s }
 ; CHECK-NEXT:    fsub za.s[w8, 7, vgx2], { z0.s, z1.s }
@@ -272,8 +272,8 @@ define void @multi_vector_sub_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0
 define void @multi_vector_sub_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
 ; CHECK-LABEL: multi_vector_sub_za_vg1x2_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fsub za.d[w8, 0, vgx2], { z0.d, z1.d }
 ; CHECK-NEXT:    fsub za.d[w8, 7, vgx2], { z0.d, z1.d }

diff  --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index 2c253a5f6b0e71..b698b60007eb9f 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -7,8 +7,8 @@
 define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) {
 ; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    fvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
 ; CHECK-NEXT:    fvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
@@ -25,8 +25,8 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half>
 define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) {
 ; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    bfvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
 ; CHECK-NEXT:    bfvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
@@ -43,8 +43,8 @@ define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloa
 define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: test_svdot_lane_za32_vg1x2_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
 ; CHECK-NEXT:    svdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
@@ -95,8 +95,8 @@ define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %
 define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: test_uvdot_lane_za32_vg1x2_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
 ; CHECK-NEXT:    uvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]

diff  --git a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
index 4bae08c0890e6c..03b41db2291a21 100644
--- a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
+++ b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
@@ -45,8 +45,8 @@ entry:
   ret i32 %ret
 ; Checking that the address loaded from is masked for a floating point load.
 ; CHECK-LABEL: csdb_emitted_for_subreg_use
-; CHECK:      ldr x8, [x0]
-; CHECK-NEXT: cmp sp, #0
+; CHECK:      cmp sp, #0
+; CHECK-NEXT: ldr x8, [x0]
 ; CHECK-NEXT: csetm x16, ne
 ; CHECK-NEXT: and x8, x8, x16
 ; csdb instruction must occur before the add instruction with w8 as operand.
@@ -70,8 +70,8 @@ entry:
   ret i64 %ret
 ; Checking that the address loaded from is masked for a floating point load.
 ; CHECK-LABEL: csdb_emitted_for_superreg_use
-; CHECK:      ldr w8, [x0]
-; CHECK-NEXT: cmp sp, #0
+; CHECK:      cmp sp, #0
+; CHECK-NEXT: ldr w8, [x0]
 ; CHECK-NEXT: csetm x16, ne
 ; CHECK-NEXT: and w8, w8, w16
 ; csdb instruction must occur before the add instruction with x8 as operand.

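The reordering above is benign for speculative load hardening: the csetm/and masking of the loaded value still follows the load, and csdb still precedes the dependent use; only the independent cmp sp, #0 moved across the ldr. A minimal sketch of the pattern under the speculative_load_hardening function attribute (register assignments are illustrative, so the two order-independent instructions are checked with CHECK-DAG):

; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
define i64 @hardened_load(ptr %p) speculative_load_hardening {
; CHECK-LABEL: hardened_load
; CHECK-DAG: cmp sp, #0
; CHECK-DAG: ldr x8, [x0]
; CHECK: csetm x16, ne
; CHECK: and x8, x8, x16
  %v = load i64, ptr %p
  ret i64 %v
}
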
diff  --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll
index 27d3f7b6ff5389..a507296338f939 100644
--- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll
+++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll
@@ -16,47 +16,47 @@ define <vscale x 2 x i64> @test_nxv2i64_v8i64(<vscale x 2 x i64> %a, <8 x i64> %
 
 ; CHECK-LABEL: test_nxv2i64_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
-; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    mov w9, #2
-; CHECK-NEXT:    sub x8, x8, #2
-; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    cmp x8, #2
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    csel x9, x8, x9, lo
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    mov z0.d, p0/m, z1.d
-; CHECK-NEXT:    lsl x9, x9, #3
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    cmp x8, #4
-; CHECK-NEXT:    str q2, [x10, x9]
-; CHECK-NEXT:    mov w9, #4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT:    csel x9, x8, x9, lo
-; CHECK-NEXT:    lsl x9, x9, #3
-; CHECK-NEXT:    addvl x10, sp, #1
-; CHECK-NEXT:    cmp x8, #6
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    str q3, [x10, x9]
-; CHECK-NEXT:    mov w9, #6
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #1, mul vl]
-; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    addvl x9, sp, #2
-; CHECK-NEXT:    lsl x8, x8, #3
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT:    str q4, [x9, x8]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #3
-; CHECK-NEXT:    .cfi_def_cfa wsp, 16
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:   .cfi_def_cfa_offset 0
-; CHECK-NEXT:   .cfi_restore w29
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    ptrue p1.d, vl2
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    mov w9, #2 // =0x2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sub x8, x8, #2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    cmp x8, #2
+; CHECK-NEXT:    csel x9, x8, x9, lo
+; CHECK-NEXT:    cmp x8, #4
+; CHECK-NEXT:    lsl x9, x9, #3
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    str q2, [x10, x9]
+; CHECK-NEXT:    mov w9, #4 // =0x4
+; CHECK-NEXT:    addvl x10, sp, #1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    csel x9, x8, x9, lo
+; CHECK-NEXT:    cmp x8, #6
+; CHECK-NEXT:    lsl x9, x9, #3
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    str q3, [x10, x9]
+; CHECK-NEXT:    mov w9, #6 // =0x6
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    addvl x9, sp, #2
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    str q4, [x9, x8]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
 
 
 
@@ -73,47 +73,47 @@ define <vscale x 2 x double> @test_nxv2f64_v8f64(<vscale x 2 x double> %a, <8 x
 
 ; CHECK-LABEL: test_nxv2f64_v8f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    addvl sp, sp, #-3
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
-; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    mov w9, #2
-; CHECK-NEXT:    sub x8, x8, #2
-; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    cmp x8, #2
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    csel x9, x8, x9, lo
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    mov z0.d, p0/m, z1.d
-; CHECK-NEXT:    lsl x9, x9, #3
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    cmp x8, #4
-; CHECK-NEXT:    str q2, [x10, x9]
-; CHECK-NEXT:    mov w9, #4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
-; CHECK-NEXT:    csel x9, x8, x9, lo
-; CHECK-NEXT:    lsl x9, x9, #3
-; CHECK-NEXT:    addvl x10, sp, #1
-; CHECK-NEXT:    cmp x8, #6
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    str q3, [x10, x9]
-; CHECK-NEXT:    mov w9, #6
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #1, mul vl]
-; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    addvl x9, sp, #2
-; CHECK-NEXT:    lsl x8, x8, #3
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT:    str q4, [x9, x8]
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
-; CHECK-NEXT:    addvl sp, sp, #3
-; CHECK-NEXT:   .cfi_def_cfa wsp, 16
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:   .cfi_def_cfa_offset 0
-; CHECK-NEXT:   .cfi_restore w29
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    ptrue p1.d, vl2
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    mov w9, #2 // =0x2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sub x8, x8, #2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    cmp x8, #2
+; CHECK-NEXT:    csel x9, x8, x9, lo
+; CHECK-NEXT:    cmp x8, #4
+; CHECK-NEXT:    lsl x9, x9, #3
+; CHECK-NEXT:    mov z0.d, p1/m, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
+; CHECK-NEXT:    str q2, [x10, x9]
+; CHECK-NEXT:    mov w9, #4 // =0x4
+; CHECK-NEXT:    addvl x10, sp, #1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT:    csel x9, x8, x9, lo
+; CHECK-NEXT:    cmp x8, #6
+; CHECK-NEXT:    lsl x9, x9, #3
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    str q3, [x10, x9]
+; CHECK-NEXT:    mov w9, #6 // =0x6
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    addvl x9, sp, #2
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    str q4, [x9, x8]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
 
 
 

diff  --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
index 3f52f1d35ed65f..f73b4bdba36e64 100644
--- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -530,7 +530,7 @@ define double @sqrt_simplify_before_recip_3_uses(double %x, ptr %p1, ptr %p2) no
 ; FAULT-LABEL: sqrt_simplify_before_recip_3_uses:
 ; FAULT:       // %bb.0:
 ; FAULT-NEXT:    fsqrt d0, d0
-; FAULT-NEXT:    mov x8, #4631107791820423168
+; FAULT-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; FAULT-NEXT:    fmov d1, #1.00000000
 ; FAULT-NEXT:    fmov d2, x8
 ; FAULT-NEXT:    fdiv d1, d1, d0
@@ -542,17 +542,17 @@ define double @sqrt_simplify_before_recip_3_uses(double %x, ptr %p1, ptr %p2) no
 ; CHECK-LABEL: sqrt_simplify_before_recip_3_uses:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frsqrte d1, d0
-; CHECK-NEXT:    mov x8, #4631107791820423168
+; CHECK-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
+; CHECK-NEXT:    fmul d2, d1, d1
+; CHECK-NEXT:    frsqrts d2, d0, d2
+; CHECK-NEXT:    fmul d1, d1, d2
+; CHECK-NEXT:    fmul d2, d1, d1
+; CHECK-NEXT:    frsqrts d2, d0, d2
+; CHECK-NEXT:    fmul d1, d1, d2
+; CHECK-NEXT:    fmul d2, d1, d1
+; CHECK-NEXT:    frsqrts d2, d0, d2
+; CHECK-NEXT:    fmul d1, d1, d2
 ; CHECK-NEXT:    fmov d2, x8
-; CHECK-NEXT:    fmul d3, d1, d1
-; CHECK-NEXT:    frsqrts d3, d0, d3
-; CHECK-NEXT:    fmul d1, d1, d3
-; CHECK-NEXT:    fmul d3, d1, d1
-; CHECK-NEXT:    frsqrts d3, d0, d3
-; CHECK-NEXT:    fmul d1, d1, d3
-; CHECK-NEXT:    fmul d3, d1, d1
-; CHECK-NEXT:    frsqrts d3, d0, d3
-; CHECK-NEXT:    fmul d1, d1, d3
 ; CHECK-NEXT:    fmul d0, d0, d1
 ; CHECK-NEXT:    fmul d2, d1, d2
 ; CHECK-NEXT:    str d1, [x0]
@@ -571,9 +571,9 @@ define double @sqrt_simplify_before_recip_3_uses_order(double %x, ptr %p1, ptr %
 ; FAULT-LABEL: sqrt_simplify_before_recip_3_uses_order:
 ; FAULT:       // %bb.0:
 ; FAULT-NEXT:    fsqrt d0, d0
-; FAULT-NEXT:    mov x8, #4631107791820423168
+; FAULT-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; FAULT-NEXT:    fmov d1, x8
-; FAULT-NEXT:    mov x8, #140737488355328
+; FAULT-NEXT:    mov x8, #140737488355328 // =0x800000000000
 ; FAULT-NEXT:    movk x8, #16453, lsl #48
 ; FAULT-NEXT:    fmov d2, x8
 ; FAULT-NEXT:    fdiv d1, d1, d0
@@ -585,10 +585,7 @@ define double @sqrt_simplify_before_recip_3_uses_order(double %x, ptr %p1, ptr %
 ; CHECK-LABEL: sqrt_simplify_before_recip_3_uses_order:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frsqrte d1, d0
-; CHECK-NEXT:    mov x9, #140737488355328
-; CHECK-NEXT:    mov x8, #4631107791820423168
-; CHECK-NEXT:    movk x9, #16453, lsl #48
-; CHECK-NEXT:    fmov d3, x9
+; CHECK-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; CHECK-NEXT:    fmul d2, d1, d1
 ; CHECK-NEXT:    frsqrts d2, d0, d2
 ; CHECK-NEXT:    fmul d1, d1, d2
@@ -599,6 +596,9 @@ define double @sqrt_simplify_before_recip_3_uses_order(double %x, ptr %p1, ptr %
 ; CHECK-NEXT:    frsqrts d2, d0, d2
 ; CHECK-NEXT:    fmul d1, d1, d2
 ; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    mov x8, #140737488355328 // =0x800000000000
+; CHECK-NEXT:    movk x8, #16453, lsl #48
+; CHECK-NEXT:    fmov d3, x8
 ; CHECK-NEXT:    fmul d0, d0, d1
 ; CHECK-NEXT:    fmul d2, d1, d2
 ; CHECK-NEXT:    fmul d1, d1, d3
@@ -620,11 +620,11 @@ define double @sqrt_simplify_before_recip_4_uses(double %x, ptr %p1, ptr %p2, pt
 ; FAULT:       // %bb.0:
 ; FAULT-NEXT:    fsqrt d0, d0
 ; FAULT-NEXT:    fmov d1, #1.00000000
-; FAULT-NEXT:    mov x9, #140737488355328
-; FAULT-NEXT:    mov x8, #4631107791820423168
-; FAULT-NEXT:    movk x9, #16453, lsl #48
+; FAULT-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; FAULT-NEXT:    fmov d2, x8
-; FAULT-NEXT:    fmov d3, x9
+; FAULT-NEXT:    mov x8, #140737488355328 // =0x800000000000
+; FAULT-NEXT:    movk x8, #16453, lsl #48
+; FAULT-NEXT:    fmov d3, x8
 ; FAULT-NEXT:    fdiv d1, d1, d0
 ; FAULT-NEXT:    fmul d2, d1, d2
 ; FAULT-NEXT:    fmul d3, d1, d3
@@ -637,10 +637,7 @@ define double @sqrt_simplify_before_recip_4_uses(double %x, ptr %p1, ptr %p2, pt
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    frsqrte d1, d0
 ; CHECK-NEXT:    fcmp d0, #0.0
-; CHECK-NEXT:    mov x9, #140737488355328
-; CHECK-NEXT:    mov x8, #4631107791820423168
-; CHECK-NEXT:    movk x9, #16453, lsl #48
-; CHECK-NEXT:    fmov d3, x9
+; CHECK-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; CHECK-NEXT:    fmul d2, d1, d1
 ; CHECK-NEXT:    frsqrts d2, d0, d2
 ; CHECK-NEXT:    fmul d1, d1, d2
@@ -651,12 +648,15 @@ define double @sqrt_simplify_before_recip_4_uses(double %x, ptr %p1, ptr %p2, pt
 ; CHECK-NEXT:    frsqrts d2, d0, d2
 ; CHECK-NEXT:    fmul d1, d1, d2
 ; CHECK-NEXT:    fmul d2, d0, d1
-; CHECK-NEXT:    fmul d3, d1, d3
 ; CHECK-NEXT:    str d1, [x0]
 ; CHECK-NEXT:    fcsel d2, d0, d2, eq
 ; CHECK-NEXT:    fdiv d0, d0, d2
 ; CHECK-NEXT:    fmov d2, x8
+; CHECK-NEXT:    mov x8, #140737488355328 // =0x800000000000
+; CHECK-NEXT:    movk x8, #16453, lsl #48
+; CHECK-NEXT:    fmov d3, x8
 ; CHECK-NEXT:    fmul d2, d1, d2
+; CHECK-NEXT:    fmul d3, d1, d3
 ; CHECK-NEXT:    str d2, [x1]
 ; CHECK-NEXT:    str d3, [x2]
 ; CHECK-NEXT:    ret

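For reference, the frsqrte/frsqrts/fmul chains above are Newton-Raphson refinement of $1/\sqrt{d}$: frsqrte produces a low-precision seed $x_0$, and frsqrts implements the architected step $\mathrm{FRSQRTS}(a,b) = (3 - ab)/2$, so each fmul d2 / frsqrts d2 / fmul d1 triple computes

  $x_{n+1} = x_n \cdot \tfrac{1}{2}\,(3 - d\,x_n^2)$

Three iterations refine the rough estimate to double precision, and the final fmul d0, d0, d1 recovers $\sqrt{d} = d \cdot (1/\sqrt{d})$. The new schedule only renames the temporary (d3 to d2) and sinks the constant materialization below the iteration chain.
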
diff  --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
index b607af42339f63..595991e86a91c7 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -23,12 +23,12 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 define i1 @test_srem_even(i4 %X) nounwind {
 ; CHECK-LABEL: test_srem_even:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sbfx w9, w0, #0, #4
-; CHECK-NEXT:    mov w8, #6 // =0x6
-; CHECK-NEXT:    add w9, w9, w9, lsl #1
-; CHECK-NEXT:    ubfx w10, w9, #7, #1
-; CHECK-NEXT:    add w9, w10, w9, lsr #4
-; CHECK-NEXT:    msub w8, w9, w8, w0
+; CHECK-NEXT:    sbfx w8, w0, #0, #4
+; CHECK-NEXT:    add w8, w8, w8, lsl #1
+; CHECK-NEXT:    ubfx w9, w8, #7, #1
+; CHECK-NEXT:    add w8, w9, w8, lsr #4
+; CHECK-NEXT:    mov w9, #6 // =0x6
+; CHECK-NEXT:    msub w8, w8, w9, w0
 ; CHECK-NEXT:    and w8, w8, #0xf
 ; CHECK-NEXT:    cmp w8, #1
 ; CHECK-NEXT:    cset w0, eq
@@ -57,45 +57,45 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
 define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; CHECK-LABEL: test_srem_vec:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #7282 // =0x1c72
-; CHECK-NEXT:    sbfx x9, x0, #0, #33
-; CHECK-NEXT:    movk x8, #29127, lsl #16
-; CHECK-NEXT:    mov x11, #7281 // =0x1c71
-; CHECK-NEXT:    movk x8, #50972, lsl #32
-; CHECK-NEXT:    movk x11, #29127, lsl #16
-; CHECK-NEXT:    movk x8, #7281, lsl #48
-; CHECK-NEXT:    movk x11, #50972, lsl #32
-; CHECK-NEXT:    sbfx x12, x1, #0, #33
-; CHECK-NEXT:    sbfx x10, x2, #0, #33
-; CHECK-NEXT:    smulh x13, x9, x8
-; CHECK-NEXT:    movk x11, #7281, lsl #48
-; CHECK-NEXT:    smulh x8, x12, x8
-; CHECK-NEXT:    smulh x11, x10, x11
-; CHECK-NEXT:    add x13, x13, x13, lsr #63
-; CHECK-NEXT:    sub x11, x11, x10
-; CHECK-NEXT:    add x8, x8, x8, lsr #63
-; CHECK-NEXT:    add x13, x13, x13, lsl #3
-; CHECK-NEXT:    asr x14, x11, #3
-; CHECK-NEXT:    sub x9, x9, x13
-; CHECK-NEXT:    add x11, x14, x11, lsr #63
-; CHECK-NEXT:    add x8, x8, x8, lsl #3
-; CHECK-NEXT:    sub x8, x12, x8
+; CHECK-NEXT:    mov x9, #7282 // =0x1c72
+; CHECK-NEXT:    sbfx x8, x0, #0, #33
+; CHECK-NEXT:    sbfx x10, x1, #0, #33
+; CHECK-NEXT:    movk x9, #29127, lsl #16
+; CHECK-NEXT:    mov x13, #7281 // =0x1c71
+; CHECK-NEXT:    sbfx x12, x2, #0, #33
+; CHECK-NEXT:    movk x9, #50972, lsl #32
+; CHECK-NEXT:    movk x13, #29127, lsl #16
+; CHECK-NEXT:    movk x9, #7281, lsl #48
+; CHECK-NEXT:    movk x13, #50972, lsl #32
+; CHECK-NEXT:    smulh x11, x8, x9
+; CHECK-NEXT:    movk x13, #7281, lsl #48
+; CHECK-NEXT:    smulh x9, x10, x9
+; CHECK-NEXT:    smulh x13, x12, x13
+; CHECK-NEXT:    add x11, x11, x11, lsr #63
+; CHECK-NEXT:    add x9, x9, x9, lsr #63
 ; CHECK-NEXT:    add x11, x11, x11, lsl #3
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    add x10, x10, x11
-; CHECK-NEXT:    mov x9, #8589934591 // =0x1ffffffff
-; CHECK-NEXT:    adrp x11, .LCPI3_0
-; CHECK-NEXT:    adrp x12, .LCPI3_1
-; CHECK-NEXT:    mov v0.d[1], x8
-; CHECK-NEXT:    fmov d1, x10
-; CHECK-NEXT:    dup v2.2d, x9
-; CHECK-NEXT:    ldr q3, [x11, :lo12:.LCPI3_0]
-; CHECK-NEXT:    ldr q4, [x12, :lo12:.LCPI3_1]
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    cmeq v0.2d, v0.2d, v3.2d
-; CHECK-NEXT:    cmeq v1.2d, v1.2d, v4.2d
+; CHECK-NEXT:    add x9, x9, x9, lsl #3
+; CHECK-NEXT:    sub x8, x8, x11
+; CHECK-NEXT:    sub x11, x13, x12
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    mov x8, #8589934591 // =0x1ffffffff
+; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    asr x10, x11, #3
+; CHECK-NEXT:    dup v1.2d, x8
+; CHECK-NEXT:    mov v0.d[1], x9
+; CHECK-NEXT:    add x9, x10, x11, lsr #63
+; CHECK-NEXT:    add x8, x9, x9, lsl #3
+; CHECK-NEXT:    adrp x9, .LCPI3_0
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI3_0]
+; CHECK-NEXT:    add x8, x12, x8
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    fmov d3, x8
+; CHECK-NEXT:    adrp x8, .LCPI3_1
+; CHECK-NEXT:    cmeq v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    cmeq v1.2d, v1.2d, v2.2d
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    mvn v1.16b, v1.16b
 ; CHECK-NEXT:    xtn v1.2s, v1.2d

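The movk chains in test_srem_vec materialize 64-bit magic multipliers. Reading the code above: the first two lanes divide by 9 via $M = \lceil 2^{64}/9 \rceil = \mathtt{0x1C71C71C71C71C72}$, computing

  $q = \mathrm{smulh}(x, M), \quad q \mathrel{+}= (q \gg_u 63), \quad r = x - 9q$

where the $q \gg_u 63$ add corrects the truncated quotient for negative $x$, and the add with lsl #3 forms $9q$. The third lane uses the variant magic 0x1C71C71C71C71C71 with an extra subtract of the dividend and an asr #3, the signature of its negative divisor. The new schedule only interleaves these independent per-lane chains differently.
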
diff  --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
index 58998b02887134..f8c6f4193959d2 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -7,10 +7,9 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    adrp x9, .LCPI0_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT:    adrp x8, .LCPI0_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI0_1]
+; CHECK-NEXT:    adrp x8, .LCPI0_2
 ; CHECK-NEXT:    adrp x9, .LCPI0_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI0_2]
@@ -18,10 +17,11 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI0_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 25, i32 100>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -43,8 +43,8 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    adrp x8, .LCPI1_0
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    cmhs v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -64,8 +64,8 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    adrp x8, .LCPI2_0
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    cmhi v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -129,10 +129,9 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI5_0
 ; CHECK-NEXT:    adrp x9, .LCPI5_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI5_0]
-; CHECK-NEXT:    adrp x8, .LCPI5_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI5_1]
+; CHECK-NEXT:    adrp x8, .LCPI5_2
 ; CHECK-NEXT:    adrp x9, .LCPI5_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI5_2]
@@ -140,10 +139,11 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI5_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI5_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -155,10 +155,9 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI6_0
 ; CHECK-NEXT:    adrp x9, .LCPI6_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI6_0]
-; CHECK-NEXT:    adrp x8, .LCPI6_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI6_1]
+; CHECK-NEXT:    adrp x8, .LCPI6_2
 ; CHECK-NEXT:    adrp x9, .LCPI6_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI6_2]
@@ -166,10 +165,11 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI6_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI6_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI6_4]
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
   %cmp = icmp ne <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -185,10 +185,9 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI7_0
 ; CHECK-NEXT:    adrp x9, .LCPI7_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
-; CHECK-NEXT:    adrp x8, .LCPI7_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI7_1]
+; CHECK-NEXT:    adrp x8, .LCPI7_2
 ; CHECK-NEXT:    adrp x9, .LCPI7_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI7_2]
@@ -196,10 +195,11 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI7_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI7_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI7_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 5, i32 16, i32 5>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -213,10 +213,9 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI8_0
 ; CHECK-NEXT:    adrp x9, .LCPI8_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT:    adrp x8, .LCPI8_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI8_1]
+; CHECK-NEXT:    adrp x8, .LCPI8_2
 ; CHECK-NEXT:    adrp x9, .LCPI8_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI8_2]
@@ -224,10 +223,11 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI8_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI8_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI8_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 14, i32 14, i32 16, i32 14>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -241,10 +241,9 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-NEXT:    adrp x9, .LCPI9_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT:    adrp x8, .LCPI9_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI9_1]
+; CHECK-NEXT:    adrp x8, .LCPI9_2
 ; CHECK-NEXT:    adrp x9, .LCPI9_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI9_2]
@@ -252,10 +251,11 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI9_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI9_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI9_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 16, i32 100>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -277,8 +277,8 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    adrp x8, .LCPI10_0
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    cmhs v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -319,10 +319,9 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI12_0
 ; CHECK-NEXT:    adrp x9, .LCPI12_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT:    adrp x8, .LCPI12_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI12_1]
+; CHECK-NEXT:    adrp x8, .LCPI12_2
 ; CHECK-NEXT:    adrp x9, .LCPI12_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI12_2]
@@ -330,10 +329,11 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI12_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI12_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI12_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 14, i32 1, i32 100>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -438,10 +438,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    adrp x9, .LCPI16_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT:    adrp x8, .LCPI16_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI16_1]
+; CHECK-NEXT:    adrp x8, .LCPI16_2
 ; CHECK-NEXT:    adrp x9, .LCPI16_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI16_2]
@@ -449,10 +448,11 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI16_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI16_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -466,10 +466,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    adrp x9, .LCPI17_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
-; CHECK-NEXT:    adrp x8, .LCPI17_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI17_1]
+; CHECK-NEXT:    adrp x8, .LCPI17_2
 ; CHECK-NEXT:    adrp x9, .LCPI17_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI17_2]
@@ -477,10 +476,11 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI17_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI17_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -494,10 +494,9 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI18_0
 ; CHECK-NEXT:    adrp x9, .LCPI18_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
-; CHECK-NEXT:    adrp x8, .LCPI18_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI18_1]
+; CHECK-NEXT:    adrp x8, .LCPI18_2
 ; CHECK-NEXT:    adrp x9, .LCPI18_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI18_2]
@@ -505,10 +504,11 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
 ; CHECK-NEXT:    adrp x8, .LCPI18_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI18_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -530,8 +530,8 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    adrp x8, .LCPI19_0
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI19_0]
+; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    cmhs v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -572,10 +572,9 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI21_0
 ; CHECK-NEXT:    adrp x9, .LCPI21_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT:    adrp x8, .LCPI21_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI21_1]
+; CHECK-NEXT:    adrp x8, .LCPI21_2
 ; CHECK-NEXT:    adrp x9, .LCPI21_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI21_2]
@@ -583,10 +582,11 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI21_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI21_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI21_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -602,10 +602,9 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI22_0
 ; CHECK-NEXT:    adrp x9, .LCPI22_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI22_0]
-; CHECK-NEXT:    adrp x8, .LCPI22_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI22_1]
+; CHECK-NEXT:    adrp x8, .LCPI22_2
 ; CHECK-NEXT:    adrp x9, .LCPI22_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI22_2]
@@ -613,10 +612,11 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI22_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI22_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI22_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -630,10 +630,9 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI23_0
 ; CHECK-NEXT:    adrp x9, .LCPI23_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI23_0]
-; CHECK-NEXT:    adrp x8, .LCPI23_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI23_1]
+; CHECK-NEXT:    adrp x8, .LCPI23_2
 ; CHECK-NEXT:    adrp x9, .LCPI23_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI23_2]
@@ -641,10 +640,11 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI23_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI23_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI23_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -658,10 +658,9 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI24_0
 ; CHECK-NEXT:    adrp x9, .LCPI24_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI24_0]
-; CHECK-NEXT:    adrp x8, .LCPI24_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI24_1]
+; CHECK-NEXT:    adrp x8, .LCPI24_2
 ; CHECK-NEXT:    adrp x9, .LCPI24_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI24_2]
@@ -669,10 +668,11 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI24_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI24_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI24_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -687,10 +687,9 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI25_0
 ; CHECK-NEXT:    adrp x9, .LCPI25_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI25_0]
-; CHECK-NEXT:    adrp x8, .LCPI25_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI25_1]
+; CHECK-NEXT:    adrp x8, .LCPI25_2
 ; CHECK-NEXT:    adrp x9, .LCPI25_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI25_2]
@@ -698,10 +697,11 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
 ; CHECK-NEXT:    adrp x8, .LCPI25_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI25_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI25_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 1>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
@@ -714,10 +714,9 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI26_0
 ; CHECK-NEXT:    adrp x9, .LCPI26_1
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI26_0]
-; CHECK-NEXT:    adrp x8, .LCPI26_2
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI26_1]
+; CHECK-NEXT:    adrp x8, .LCPI26_2
 ; CHECK-NEXT:    adrp x9, .LCPI26_3
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI26_2]
@@ -725,10 +724,11 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
 ; CHECK-NEXT:    adrp x8, .LCPI26_4
 ; CHECK-NEXT:    ushl v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ushl v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI26_4]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI26_4]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %srem = srem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1>
   %cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>

diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
index 0b06032add8421..1d9cb88260b609 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
@@ -14,8 +14,8 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    mov w8, #28834 // =0x70a2
 ; CHECK-NEXT:    movk w8, #2621, lsl #16
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    cmhs v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -36,8 +36,8 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    mov w8, #23592 // =0x5c28
-; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    movk w8, #655, lsl #16
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    shl v0.4s, v2.4s, #30
 ; CHECK-NEXT:    usra v0.4s, v2.4s, #2
@@ -66,8 +66,8 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    mov w8, #28834 // =0x70a2
 ; CHECK-NEXT:    movk w8, #2621, lsl #16
 ; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    cmhs v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -88,8 +88,8 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    dup v2.4s, w9
 ; CHECK-NEXT:    mov w8, #23592 // =0x5c28
-; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    movk w8, #655, lsl #16
+; CHECK-NEXT:    mla v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    shl v0.4s, v2.4s, #30
 ; CHECK-NEXT:    usra v0.4s, v2.4s, #2
@@ -112,15 +112,15 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #34079 // =0x851f
 ; CHECK-NEXT:    movk w8, #20971, lsl #16
-; CHECK-NEXT:    movi v3.4s, #25
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
 ; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    sshr v2.4s, v1.4s, #3
 ; CHECK-NEXT:    usra v2.4s, v1.4s, #31
+; CHECK-NEXT:    movi v1.4s, #25
+; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    mls v0.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -135,15 +135,15 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #34079 // =0x851f
 ; CHECK-NEXT:    movk w8, #20971, lsl #16
-; CHECK-NEXT:    movi v3.4s, #100
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
 ; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    sshr v2.4s, v1.4s, #5
 ; CHECK-NEXT:    usra v2.4s, v1.4s, #31
+; CHECK-NEXT:    movi v1.4s, #100
+; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    mls v0.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -182,9 +182,9 @@ define <4 x i32> @test_srem_one_ne(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_pow2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v3.4s, v0.4s, #0
+; CHECK-NEXT:    cmlt v1.4s, v0.4s, #0
 ; CHECK-NEXT:    mov v2.16b, v0.16b
-; CHECK-NEXT:    usra v2.4s, v3.4s, #28
+; CHECK-NEXT:    usra v2.4s, v1.4s, #28
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    bic v2.4s, #15
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v2.4s
@@ -201,11 +201,11 @@ define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_int_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v2.4s, v0.4s, #0
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    movi v3.4s, #128, lsl #24
-; CHECK-NEXT:    usra v1.4s, v2.4s, #1
-; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    cmlt v1.4s, v0.4s, #0
+; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    usra v2.4s, v1.4s, #1
+; CHECK-NEXT:    movi v1.4s, #128, lsl #24
+; CHECK-NEXT:    and v1.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0

diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
index da8052f5e88403..0598af7c980635 100644
--- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
@@ -7,9 +7,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; CHECK-NEXT:    adrp x8, .LCPI0_1
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI0_1]
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
-; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT:    adrp x8, .LCPI0_2
+; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    shrn v1.4h, v1.4s, #16
 ; CHECK-NEXT:    mla v1.4h, v0.4h, v2.4h
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI0_2]
@@ -69,9 +69,9 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
 ; CHECK-NEXT:    adrp x8, .LCPI3_0
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    adrp x8, .LCPI3_1
-; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI3_1]
 ; CHECK-NEXT:    adrp x8, .LCPI3_2
+; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    shrn v1.4h, v1.4s, #16
 ; CHECK-NEXT:    add v1.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    sshl v1.4h, v1.4h, v2.4h
@@ -91,18 +91,18 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    adrp x8, .LCPI4_1
-; CHECK-NEXT:    and v2.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI4_1]
-; CHECK-NEXT:    adrp x8, .LCPI4_2
+; CHECK-NEXT:    and v2.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    shrn v1.4h, v1.4s, #16
 ; CHECK-NEXT:    add v1.4h, v1.4h, v2.4h
-; CHECK-NEXT:    sshl v1.4h, v1.4h, v3.4h
-; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI4_2]
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_1]
+; CHECK-NEXT:    adrp x8, .LCPI4_2
+; CHECK-NEXT:    sshl v1.4h, v1.4h, v2.4h
 ; CHECK-NEXT:    ushr v2.4h, v1.4h, #15
 ; CHECK-NEXT:    mov v2.h[0], wzr
 ; CHECK-NEXT:    add v1.4h, v1.4h, v2.4h
-; CHECK-NEXT:    mls v0.4h, v1.4h, v3.4h
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_2]
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1
@@ -115,19 +115,19 @@ define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) {
 ; CHECK-NEXT:    adrp x8, .LCPI5_1
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI5_1]
 ; CHECK-NEXT:    adrp x8, .LCPI5_0
-; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI5_0]
 ; CHECK-NEXT:    adrp x8, .LCPI5_2
+; CHECK-NEXT:    smull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    shrn v1.4h, v1.4s, #16
 ; CHECK-NEXT:    mla v1.4h, v0.4h, v2.4h
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI5_2]
 ; CHECK-NEXT:    adrp x8, .LCPI5_3
 ; CHECK-NEXT:    sshl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI5_3]
 ; CHECK-NEXT:    ushr v2.4h, v1.4h, #15
 ; CHECK-NEXT:    mov v2.h[0], wzr
 ; CHECK-NEXT:    add v1.4h, v1.4h, v2.4h
-; CHECK-NEXT:    mls v0.4h, v1.4h, v3.4h
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI5_3]
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
   ret <4 x i16> %1
@@ -138,38 +138,38 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
 ; CHECK-LABEL: dont_fold_srem_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x8, #8549 // =0x2165
-; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    fmov x10, d1
+; CHECK-NEXT:    mov x9, v1.d[1]
 ; CHECK-NEXT:    movk x8, #22795, lsl #16
 ; CHECK-NEXT:    mov x12, #6055 // =0x17a7
+; CHECK-NEXT:    mov x11, v0.d[1]
 ; CHECK-NEXT:    movk x8, #17096, lsl #32
 ; CHECK-NEXT:    movk x12, #58853, lsl #16
+; CHECK-NEXT:    mov x13, #21445 // =0x53c5
 ; CHECK-NEXT:    movk x8, #45590, lsl #48
-; CHECK-NEXT:    mov x14, #21445 // =0x53c5
-; CHECK-NEXT:    mov x10, v1.d[1]
 ; CHECK-NEXT:    movk x12, #47142, lsl #32
-; CHECK-NEXT:    smulh x8, x9, x8
-; CHECK-NEXT:    movk x14, #1603, lsl #16
-; CHECK-NEXT:    mov x11, v0.d[1]
+; CHECK-NEXT:    movk x13, #1603, lsl #16
+; CHECK-NEXT:    smulh x8, x10, x8
 ; CHECK-NEXT:    movk x12, #24749, lsl #48
-; CHECK-NEXT:    add x8, x8, x9
-; CHECK-NEXT:    movk x14, #15432, lsl #32
-; CHECK-NEXT:    asr x13, x8, #4
-; CHECK-NEXT:    movk x14, #25653, lsl #48
-; CHECK-NEXT:    add x8, x13, x8, lsr #63
-; CHECK-NEXT:    mov w13, #23 // =0x17
-; CHECK-NEXT:    smulh x12, x10, x12
-; CHECK-NEXT:    smulh x14, x11, x14
-; CHECK-NEXT:    msub x8, x8, x13, x9
-; CHECK-NEXT:    asr x13, x12, #11
-; CHECK-NEXT:    add x12, x13, x12, lsr #63
-; CHECK-NEXT:    asr x13, x14, #8
-; CHECK-NEXT:    mov w9, #5423 // =0x152f
-; CHECK-NEXT:    add x13, x13, x14, lsr #63
-; CHECK-NEXT:    mov w14, #654 // =0x28e
-; CHECK-NEXT:    msub x9, x12, x9, x10
-; CHECK-NEXT:    fmov d1, x8
-; CHECK-NEXT:    msub x10, x13, x14, x11
+; CHECK-NEXT:    movk x13, #15432, lsl #32
+; CHECK-NEXT:    movk x13, #25653, lsl #48
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    smulh x12, x9, x12
+; CHECK-NEXT:    smulh x13, x11, x13
+; CHECK-NEXT:    add x8, x8, x10
+; CHECK-NEXT:    asr x14, x8, #4
+; CHECK-NEXT:    asr x15, x12, #11
+; CHECK-NEXT:    add x8, x14, x8, lsr #63
+; CHECK-NEXT:    mov w14, #23 // =0x17
+; CHECK-NEXT:    add x12, x15, x12, lsr #63
+; CHECK-NEXT:    msub x8, x8, x14, x10
+; CHECK-NEXT:    asr x10, x13, #8
+; CHECK-NEXT:    mov w14, #5423 // =0x152f
+; CHECK-NEXT:    add x10, x10, x13, lsr #63
+; CHECK-NEXT:    msub x9, x12, x14, x9
+; CHECK-NEXT:    mov w12, #654 // =0x28e
+; CHECK-NEXT:    msub x10, x10, x12, x11
+; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    mov v1.d[1], x9
 ; CHECK-NEXT:    mov v0.d[1], x10
 ; CHECK-NEXT:    ret
@@ -246,14 +246,14 @@ define <4 x i32> @fold_srem_v4i32(<4 x i32> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #26215 // =0x6667
 ; CHECK-NEXT:    movk w8, #26214, lsl #16
-; CHECK-NEXT:    movi v3.4s, #10
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
 ; CHECK-NEXT:    smull v1.2d, v0.2s, v1.2s
 ; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    sshr v2.4s, v1.4s, #2
 ; CHECK-NEXT:    usra v2.4s, v1.4s, #31
-; CHECK-NEXT:    mls v0.4s, v2.4s, v3.4s
+; CHECK-NEXT:    movi v1.4s, #10
+; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    ret
   %1 = srem <4 x i32> %x, <i32 10, i32 10, i32 10, i32 10>
   ret <4 x i32> %1
@@ -281,18 +281,18 @@ define <2 x i32> @fold_srem_v2i32(<2 x i32> %x) {
 define <2 x i64> @fold_srem_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: fold_srem_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #7378697629483820646 // =0x6666666666666666
 ; CHECK-NEXT:    fmov x10, d0
-; CHECK-NEXT:    movk x8, #26215
+; CHECK-NEXT:    mov x8, #7378697629483820646 // =0x6666666666666666
 ; CHECK-NEXT:    mov x9, v0.d[1]
+; CHECK-NEXT:    movk x8, #26215
 ; CHECK-NEXT:    smulh x11, x10, x8
-; CHECK-NEXT:    asr x12, x11, #2
 ; CHECK-NEXT:    smulh x8, x9, x8
+; CHECK-NEXT:    asr x12, x11, #2
 ; CHECK-NEXT:    add x11, x12, x11, lsr #63
+; CHECK-NEXT:    asr x13, x8, #2
 ; CHECK-NEXT:    mov w12, #10 // =0xa
 ; CHECK-NEXT:    msub x10, x11, x12, x10
-; CHECK-NEXT:    asr x11, x8, #2
-; CHECK-NEXT:    add x8, x11, x8, lsr #63
+; CHECK-NEXT:    add x8, x13, x8, lsr #63
 ; CHECK-NEXT:    msub x8, x8, x12, x9
 ; CHECK-NEXT:    fmov d0, x10
 ; CHECK-NEXT:    mov v0.d[1], x8
@@ -305,8 +305,8 @@ define <1 x i64> @fold_srem_v1i64(<1 x i64> %x) {
 ; CHECK-LABEL: fold_srem_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov x8, #7378697629483820646 // =0x6666666666666666
 ; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    mov x8, #7378697629483820646 // =0x6666666666666666
 ; CHECK-NEXT:    movk x8, #26215
 ; CHECK-NEXT:    smulh x8, x9, x8
 ; CHECK-NEXT:    asr x10, x8, #2

diff --git a/llvm/test/CodeGen/AArch64/sshl_sat.ll b/llvm/test/CodeGen/AArch64/sshl_sat.ll
index 4572633ea15357..fbcd2db1298f0b 100644
--- a/llvm/test/CodeGen/AArch64/sshl_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sshl_sat.ll
@@ -74,7 +74,7 @@ define i16 @combine_shlsat_by_zero(i16 %x, i16 %y) nounwind {
 define i16 @combine_shlsat_constfold(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: combine_shlsat_constfold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #32
+; CHECK-NEXT:    mov w0, #32 // =0x20
 ; CHECK-NEXT:    ret
   %tmp = call i16 @llvm.sshl.sat.i16(i16 8, i16 2)
   ret i16 %tmp
@@ -84,7 +84,7 @@ define i16 @combine_shlsat_constfold(i16 %x, i16 %y) nounwind {
 define i16 @combine_shlsat_satmax(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: combine_shlsat_satmax:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #32767
+; CHECK-NEXT:    mov w0, #32767 // =0x7fff
 ; CHECK-NEXT:    ret
   %tmp = call i16 @llvm.sshl.sat.i16(i16 8, i16 15)
   ret i16 %tmp
@@ -94,7 +94,7 @@ define i16 @combine_shlsat_satmax(i16 %x, i16 %y) nounwind {
 define i16 @combine_shlsat_satmin(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: combine_shlsat_satmin:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #32768
+; CHECK-NEXT:    mov w0, #32768 // =0x8000
 ; CHECK-NEXT:    ret
   %tmp = call i16 @llvm.sshl.sat.i16(i16 -8, i16 15)
   ret i16 %tmp
@@ -107,10 +107,10 @@ define void @combine_shlsat_vector() nounwind {
 ; CHECK-LABEL: combine_shlsat_vector:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w0, #32
-; CHECK-NEXT:    mov w1, #32767
-; CHECK-NEXT:    mov w2, #65504
-; CHECK-NEXT:    mov w3, #32768
+; CHECK-NEXT:    mov w0, #32 // =0x20
+; CHECK-NEXT:    mov w1, #32767 // =0x7fff
+; CHECK-NEXT:    mov w2, #65504 // =0xffe0
+; CHECK-NEXT:    mov w3, #32768 // =0x8000
 ; CHECK-NEXT:    bl sink4xi16
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -143,11 +143,11 @@ define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind {
 ; CHECK-LABEL: combine_shlsat_to_shl_no_fold:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    mov w9, #-65536
-; CHECK-NEXT:    mov w10, #-2147483648
+; CHECK-NEXT:    mov w9, #-65536 // =0xffff0000
+; CHECK-NEXT:    mov w10, #-2147483648 // =0x80000000
 ; CHECK-NEXT:    ands w8, w9, w8, lsl #14
-; CHECK-NEXT:    lsl w9, w8, #3
 ; CHECK-NEXT:    cinv w10, w10, ge
+; CHECK-NEXT:    lsl w9, w8, #3
 ; CHECK-NEXT:    cmp w8, w9, asr #3
 ; CHECK-NEXT:    csel w8, w10, w9, ne
 ; CHECK-NEXT:    asr w0, w8, #16

diff --git a/llvm/test/CodeGen/AArch64/ssub_sat.ll b/llvm/test/CodeGen/AArch64/ssub_sat.ll
index b541e7c4faa661..4ecfc03c8bbd7b 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat.ll
@@ -36,11 +36,11 @@ define i16 @func16(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: func16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    mov w9, #32767
+; CHECK-NEXT:    mov w9, #32767 // =0x7fff
 ; CHECK-NEXT:    sub w8, w8, w1, sxth
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    csel w8, w8, w9, lt
-; CHECK-NEXT:    mov w9, #-32768
+; CHECK-NEXT:    mov w9, #-32768 // =0xffff8000
 ; CHECK-NEXT:    cmn w8, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret
@@ -51,12 +51,12 @@ define i16 @func16(i16 %x, i16 %y) nounwind {
 define i8 @func8(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: func8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    mov w9, #127
-; CHECK-NEXT:    sub w8, w8, w1, sxtb
-; CHECK-NEXT:    cmp w8, #127
-; CHECK-NEXT:    csel w8, w8, w9, lt
-; CHECK-NEXT:    mov w9, #-128
+; CHECK-NEXT:    sxtb w9, w0
+; CHECK-NEXT:    mov w8, #127 // =0x7f
+; CHECK-NEXT:    sub w9, w9, w1, sxtb
+; CHECK-NEXT:    cmp w9, #127
+; CHECK-NEXT:    csel w8, w9, w8, lt
+; CHECK-NEXT:    mov w9, #-128 // =0xffffff80
 ; CHECK-NEXT:    cmn w8, #128
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret
@@ -67,13 +67,13 @@ define i8 @func8(i8 %x, i8 %y) nounwind {
 define i4 @func3(i4 %x, i4 %y) nounwind {
 ; CHECK-LABEL: func3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w1, #28
-; CHECK-NEXT:    sbfx w9, w0, #0, #4
-; CHECK-NEXT:    sub w8, w9, w8, asr #28
-; CHECK-NEXT:    mov w9, #7
-; CHECK-NEXT:    cmp w8, #7
-; CHECK-NEXT:    csel w8, w8, w9, lt
-; CHECK-NEXT:    mov w9, #-8
+; CHECK-NEXT:    lsl w9, w1, #28
+; CHECK-NEXT:    sbfx w10, w0, #0, #4
+; CHECK-NEXT:    mov w8, #7 // =0x7
+; CHECK-NEXT:    sub w9, w10, w9, asr #28
+; CHECK-NEXT:    cmp w9, #7
+; CHECK-NEXT:    csel w8, w9, w8, lt
+; CHECK-NEXT:    mov w9, #-8 // =0xfffffff8
 ; CHECK-NEXT:    cmn w8, #8
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll b/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll
index 0eec25a212b01d..f7634f82499e75 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll
@@ -37,13 +37,13 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; CHECK-LABEL: func16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w9, w1, w2
-; CHECK-NEXT:    sxth w10, w0
-; CHECK-NEXT:    mov w8, #32767
-; CHECK-NEXT:    sub w9, w10, w9, sxth
-; CHECK-NEXT:    cmp w9, w8
-; CHECK-NEXT:    csel w8, w9, w8, lt
-; CHECK-NEXT:    mov w9, #-32768
+; CHECK-NEXT:    mul w8, w1, w2
+; CHECK-NEXT:    sxth w9, w0
+; CHECK-NEXT:    sub w8, w9, w8, sxth
+; CHECK-NEXT:    mov w9, #32767 // =0x7fff
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    csel w8, w8, w9, lt
+; CHECK-NEXT:    mov w9, #-32768 // =0xffff8000
 ; CHECK-NEXT:    cmn w8, #8, lsl #12 // =32768
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret
@@ -55,13 +55,13 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; CHECK-LABEL: func8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w9, w1, w2
-; CHECK-NEXT:    sxtb w10, w0
-; CHECK-NEXT:    mov w8, #127
-; CHECK-NEXT:    sub w9, w10, w9, sxtb
-; CHECK-NEXT:    cmp w9, #127
-; CHECK-NEXT:    csel w8, w9, w8, lt
-; CHECK-NEXT:    mov w9, #-128
+; CHECK-NEXT:    mul w8, w1, w2
+; CHECK-NEXT:    sxtb w9, w0
+; CHECK-NEXT:    sub w8, w9, w8, sxtb
+; CHECK-NEXT:    mov w9, #127 // =0x7f
+; CHECK-NEXT:    cmp w8, #127
+; CHECK-NEXT:    csel w8, w8, w9, lt
+; CHECK-NEXT:    mov w9, #-128 // =0xffffff80
 ; CHECK-NEXT:    cmn w8, #128
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret
@@ -73,14 +73,14 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; CHECK-LABEL: func4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w9, w1, w2
-; CHECK-NEXT:    sbfx w10, w0, #0, #4
-; CHECK-NEXT:    mov w8, #7
-; CHECK-NEXT:    lsl w9, w9, #28
-; CHECK-NEXT:    sub w9, w10, w9, asr #28
-; CHECK-NEXT:    cmp w9, #7
-; CHECK-NEXT:    csel w8, w9, w8, lt
-; CHECK-NEXT:    mov w9, #-8
+; CHECK-NEXT:    mul w8, w1, w2
+; CHECK-NEXT:    sbfx w9, w0, #0, #4
+; CHECK-NEXT:    lsl w8, w8, #28
+; CHECK-NEXT:    sub w8, w9, w8, asr #28
+; CHECK-NEXT:    mov w9, #7 // =0x7
+; CHECK-NEXT:    cmp w8, #7
+; CHECK-NEXT:    csel w8, w8, w9, lt
+; CHECK-NEXT:    mov w9, #-8 // =0xfffffff8
 ; CHECK-NEXT:    cmn w8, #8
 ; CHECK-NEXT:    csel w0, w8, w9, gt
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index 7feeac33151111..fa707d18710ae2 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -45,8 +45,8 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    sqsub v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    sqsub v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
@@ -76,8 +76,8 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    sqsub v1.8h, v1.8h, v3.8h
+; CHECK-NEXT:    sqsub v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    ret
   %z = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
@@ -98,9 +98,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    sqsub v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    sqsub v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <8 x i8>, ptr %px
@@ -117,8 +117,8 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
@@ -134,15 +134,15 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.b }[0], [x1]
-; CHECK-NEXT:    add x8, x1, #1
-; CHECK-NEXT:    ld1 { v1.b }[0], [x0]
-; CHECK-NEXT:    add x9, x0, #1
+; CHECK-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-NEXT:    ld1 { v1.b }[0], [x1]
+; CHECK-NEXT:    add x8, x0, #1
+; CHECK-NEXT:    add x9, x1, #1
 ; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
 ; CHECK-NEXT:    ld1 { v1.b }[4], [x9]
-; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-NEXT:    sqsub v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
@@ -159,9 +159,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    sqsub v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    sqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i16>, ptr %px
@@ -174,15 +174,15 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1 { v0.h }[0], [x1]
-; CHECK-NEXT:    add x8, x1, #2
-; CHECK-NEXT:    ld1 { v1.h }[0], [x0]
-; CHECK-NEXT:    add x9, x0, #2
+; CHECK-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-NEXT:    ld1 { v1.h }[0], [x1]
+; CHECK-NEXT:    add x8, x0, #2
+; CHECK-NEXT:    add x9, x1, #2
 ; CHECK-NEXT:    ld1 { v0.h }[2], [x8]
 ; CHECK-NEXT:    ld1 { v1.h }[2], [x9]
-; CHECK-NEXT:    shl v0.2s, v0.2s, #16
 ; CHECK-NEXT:    shl v1.2s, v1.2s, #16
-; CHECK-NEXT:    sqsub v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sqsub v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
@@ -225,9 +225,9 @@ define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v1i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr b0, [x1]
-; CHECK-NEXT:    ldr b1, [x0]
-; CHECK-NEXT:    sqsub v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    ldr b1, [x1]
+; CHECK-NEXT:    sqsub v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i8>, ptr %px
@@ -240,9 +240,9 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v1i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr h0, [x1]
-; CHECK-NEXT:    ldr h1, [x0]
-; CHECK-NEXT:    sqsub v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ldr h1, [x1]
+; CHECK-NEXT:    sqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i16>, ptr %px
@@ -300,8 +300,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    sqsub v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    sqsub v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
   %z = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -331,8 +331,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sqsub v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    sqsub v1.2d, v1.2d, v3.2d
+; CHECK-NEXT:    sqsub v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z
@@ -356,16 +356,16 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-NEXT:    subs x8, x2, x6
 ; CHECK-NEXT:    sbcs x9, x3, x7
 ; CHECK-NEXT:    asr x10, x9, #63
+; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
 ; CHECK-NEXT:    csel x2, x10, x8, vs
-; CHECK-NEXT:    eor x8, x10, #0x8000000000000000
-; CHECK-NEXT:    csel x3, x8, x9, vs
+; CHECK-NEXT:    csel x3, x11, x9, vs
 ; CHECK-NEXT:    subs x8, x0, x4
 ; CHECK-NEXT:    sbcs x9, x1, x5
 ; CHECK-NEXT:    asr x10, x9, #63
 ; CHECK-NEXT:    csel x8, x10, x8, vs
-; CHECK-NEXT:    eor x10, x10, #0x8000000000000000
-; CHECK-NEXT:    csel x1, x10, x9, vs
+; CHECK-NEXT:    eor x11, x10, #0x8000000000000000
 ; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    csel x1, x11, x9, vs
 ; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/stack-guard-sysreg.ll b/llvm/test/CodeGen/AArch64/stack-guard-sysreg.ll
index 7b308f306cfc50..9749a1cdd3a83c 100644
--- a/llvm/test/CodeGen/AArch64/stack-guard-sysreg.ll
+++ b/llvm/test/CodeGen/AArch64/stack-guard-sysreg.ll
@@ -1,4 +1,3 @@
-; RUN: split-file %s %t
 ; RUN: cat %t/main.ll %t/a.ll > %t/a2.ll
 ; RUN: cat %t/main.ll %t/b.ll > %t/b2.ll
 ; RUN: cat %t/main.ll %t/c.ll > %t/c2.ll
@@ -20,9 +19,9 @@
 ; RUN: llc %t/e2.ll -verify-machineinstrs -o - | \
 ; RUN: FileCheck --check-prefix=CHECK --check-prefix=CHECK-NPOT-NEG-OFFSET %s
 ; RUN: llc %t/f2.ll -verify-machineinstrs -o - | \
-; RUN: FileCheck --check-prefix=CHECK --check-prefix=CHECK-257-OFFSET %s
+; RUN: FileCheck --check-prefix=CHECK-ADD --check-prefix=CHECK-257-OFFSET %s
 ; RUN: llc %t/g2.ll -verify-machineinstrs -o - | \
-; RUN: FileCheck --check-prefix=CHECK --check-prefix=CHECK-MINUS-257-OFFSET %s
+; RUN: FileCheck --check-prefix=CHECK-ADD --check-prefix=CHECK-MINUS-257-OFFSET %s
 
 ; XFAIL
 ; RUN: not --crash llc %t/h2.ll -o - 2>&1 | \
@@ -39,59 +38,101 @@ target triple = "aarch64-unknown-linux-gnu"
 ; Verify that we `mrs` from `SP_EL0` twice, rather than load from
 ; __stack_chk_guard.
 define dso_local void @foo(i64 %t) local_unnamed_addr #0 {
-; CHECK-LABEL:   foo:
-; CHECK:         // %bb.0: // %entry
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-LABEL: foo:                                    // @foo
+; CHECK:         .cfi_startproc
+; CHECK: // %bb.0:                               // %entry
+; CHECK-NEXT:    stp     x29, x30, [sp, #-16]!           // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    mov     x29, sp
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    mrs x8, SP_EL0
-; CHECK-NEXT:    lsl x9, x0, #2
-; CHECK-NO-OFFSET:       ldr x8, [x8]
+; CHECK-NEXT:    sub     sp, sp, #16
+; CHECK-NEXT:    mrs     x8, SP_EL0
+; CHECK-NEXT:    lsl     x9, x0, #2
+; CHECK-NO-OFFSET: ldr     x8, [x8]
 ; CHECK-POSITIVE-OFFSET: ldr x8, [x8, #8]
 ; CHECK-NEGATIVE-OFFSET: ldur x8, [x8, #-8]
 ; CHECK-NPOT-OFFSET:     ldur x8, [x8, #1]
 ; CHECK-NPOT-NEG-OFFSET: ldur x8, [x8, #-1]
-; CHECK-257-OFFSET:      add x8, x8, #257
-; CHECK-MINUS-257-OFFSET:      sub x8, x8, #257
-; CHECK-NEXT:    add x9, x9, #15
-; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
-; CHECK-257-OFFSET-NEXT: ldr x8, [x8]
-; CHECK-MINUS-257-OFFSET-NEXT: ldr x8, [x8]
-; CHECK-NEXT:    stur x8, [x29, #-8]
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    sub x0, x8, x9
-; CHECK-NEXT:    mov sp, x0
-; CHECK-NEXT:    bl baz
-; CHECK-NEXT:    mrs x8, SP_EL0
+; CHECK-NEXT:    add     x9, x9, #15
+; CHECK-NEXT:    stur    x8, [x29, #-8]
+; CHECK-NEXT:    mov     x8, sp
+; CHECK-NEXT:    and     x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub     x0, x8, x9
+; CHECK-NEXT:    mov     sp, x0
+; CHECK-NEXT:    bl      baz
+; CHECK-NEXT:    mrs     x8, SP_EL0
 ; CHECK-NO-OFFSET:       ldr x8, [x8]
 ; CHECK-POSITIVE-OFFSET: ldr x8, [x8, #8]
 ; CHECK-NEGATIVE-OFFSET: ldur x8, [x8, #-8]
 ; CHECK-NPOT-OFFSET:     ldur x8, [x8, #1]
 ; CHECK-NPOT-NEG-OFFSET: ldur x8, [x8, #-1]
-; CHECK-257-OFFSET:      add x8, x8, #257
-; CHECK-257-OFFSET-NEXT: ldr x8, [x8]
-; CHECK-MINUS-257-OFFSET:      sub x8, x8, #257
-; CHECK-MINUS-257-OFFSET-NEXT: ldr x8, [x8]
-; CHECK-NEXT:    ldur x9, [x29, #-8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB0_2
-; CHECK-NEXT:  // %bb.1: // %entry
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    .cfi_def_cfa wsp, 16
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:   .cfi_def_cfa_offset 0
-; CHECK-NEXT:   .cfi_restore w30
-; CHECK-NEXT:   .cfi_restore w29
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_2: // %entry
-; CHECK-NEXT:   .cfi_restore_state
-; CHECK-NEXT:    bl __stack_chk_fail
-; CHECK-NOT: __stack_chk_guard
+; CHECK-NEXT:          ldur    x9, [x29, #-8]
+; CHECK-NEXT:          cmp     x8, x9
+; CHECK-NEXT:          b.ne    .LBB0_2
+; CHECK-NEXT: // %bb.1:                               // %entry
+; CHECK-NEXT:         mov     sp, x29
+; CHECK-NEXT:         .cfi_def_cfa wsp, 16
+; CHECK-NEXT:         ldp     x29, x30, [sp], #16             // 16-byte Folded Reload
+; CHECK-NEXT:         .cfi_def_cfa_offset 0
+; CHECK-NEXT:         .cfi_restore w30
+; CHECK-NEXT:         .cfi_restore w29
+; CHECK-NEXT:         ret
+; CHECK-NEXT: .LBB0_2:                                // %entry
+; CHECK-NEXT:         .cfi_restore_state
+; CHECK-NEXT:         bl      __stack_chk_fail
+; CHECK-NEXT: .Lfunc_end0:
+; CHECK-NEXT:         .size   foo, .Lfunc_end0-foo
+; CHECK-NEXT:         .cfi_endproc
+; CHECK-NEXT:                                        // -- End function
+; CHECK-NEXT:        .section        ".note.GNU-stack","",@progbits
+
+
+; CHECK-ADD:        stp     x29, x30, [sp, #-16]!           // 16-byte Folded Spill
+; CHECK-ADD-NEXT:        .cfi_def_cfa_offset 16
+; CHECK-ADD-NEXT:        mov     x29, sp
+; CHECK-ADD-NEXT:        .cfi_def_cfa w29, 16
+; CHECK-ADD-NEXT:        .cfi_offset w30, -8
+; CHECK-ADD-NEXT:        .cfi_offset w29, -16
+; CHECK-ADD-NEXT:        .cfi_remember_state
+; CHECK-ADD-NEXT:        sub     sp, sp, #16
+; CHECK-ADD-NEXT:        mrs     x8, SP_EL0
+; CHECK-ADD-NEXT:        lsl     x9, x0, #2
+; CHECK-MINUS-257-OFFSET: sub     x8, x8, #257
+; CHECK-257-OFFSET:      add     x8, x8, #257
+; CHECK-ADD-NEXT:        ldr     x8, [x8]
+; CHECK-ADD-NEXT:        add     x9, x9, #15
+; CHECK-ADD-NEXT:        and     x9, x9, #0xfffffffffffffff0
+; CHECK-ADD-NEXT:        stur    x8, [x29, #-8]
+; CHECK-ADD-NEXT:        mov     x8, sp
+; CHECK-ADD-NEXT:        sub     x0, x8, x9
+; CHECK-ADD-NEXT:        mov     sp, x0
+; CHECK-ADD-NEXT:        bl      baz
+; CHECK-ADD-NEXT:        mrs     x8, SP_EL0
+; CHECK-257-OFFSET:      add     x8, x8, #257
+; CHECK-MINUS-257-OFFSET: sub     x8, x8, #257
+; CHECK-ADD-NEXT:         ldr     x8, [x8]
+; CHECK-ADD-NEXT:         ldur    x9, [x29, #-8]
+; CHECK-ADD-NEXT:         cmp     x8, x9
+; CHECK-ADD-NEXT:         b.ne    .LBB0_2
+; CHECK-ADD-NEXT: // %bb.1:                               // %entry
+; CHECK-ADD-NEXT:         mov     sp, x29
+; CHECK-ADD-NEXT:         .cfi_def_cfa wsp, 16
+; CHECK-ADD-NEXT:         ldp     x29, x30, [sp], #16             // 16-byte Folded Reload
+; CHECK-ADD-NEXT:         .cfi_def_cfa_offset 0
+; CHECK-ADD-NEXT:         .cfi_restore w30
+; CHECK-ADD-NEXT:         .cfi_restore w29
+; CHECK-ADD-NEXT:         ret
+; CHECK-ADD-NEXT: .LBB0_2:                                // %entry
+; CHECK-ADD-NEXT:         .cfi_restore_state
+; CHECK-ADD-NEXT:         bl      __stack_chk_fail
+; CHECK-ADD-NEXT: .Lfunc_end0:
+; CHECK-ADD-NEXT:         .size   foo, .Lfunc_end0-foo
+; CHECK-ADD-NEXT:         .cfi_endproc
+; CHECK-ADD-NEXT:                                         // -- End function
+; CHECK-ADD-NEXT:         .section        ".note.GNU-stack","",@progbits
 entry:
   %vla = alloca i32, i64 %t, align 4
   call void @baz(ptr nonnull %vla)

diff --git a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
index e45c36bfaf10a3..6326d3db9afb81 100644
--- a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
@@ -123,8 +123,8 @@ define void @test_void_vararg() gc "statepoint-example" {
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w0, #42
-; CHECK-NEXT:    mov w1, #43
+; CHECK-NEXT:    mov w0, #42 // =0x2a
+; CHECK-NEXT:    mov w1, #43 // =0x2b
 ; CHECK-NEXT:    bl varargf
 ; CHECK-NEXT:  .Ltmp6:
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -177,7 +177,7 @@ define i1 @test_cross_bb(ptr addrspace(1) %a, i1 %external_cond) gc "statepoint-
 ; CHECK-NEXT:    bl consume
 ; CHECK-NEXT:    b .LBB8_3
 ; CHECK-NEXT:  .LBB8_2:
-; CHECK-NEXT:    mov w19, #1
+; CHECK-NEXT:    mov w19, #1 // =0x1
 ; CHECK-NEXT:  .LBB8_3: // %common.ret
 ; CHECK-NEXT:    and w0, w19, #0x1
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
@@ -208,13 +208,13 @@ define void @test_attributes(ptr byval(%struct2) %s) gc "statepoint-example" {
 ; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x18, xzr
-; CHECK-NEXT:    ldr q0, [sp, #48]
 ; CHECK-NEXT:    ldr x8, [sp, #64]
-; CHECK-NEXT:    mov w0, #42
-; CHECK-NEXT:    mov w1, #17
-; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    ldr q0, [sp, #48]
+; CHECK-NEXT:    mov x18, xzr
+; CHECK-NEXT:    mov w0, #42 // =0x2a
+; CHECK-NEXT:    mov w1, #17 // =0x11
 ; CHECK-NEXT:    str x8, [sp, #16]
+; CHECK-NEXT:    str q0, [sp]
 ; CHECK-NEXT:    bl consume_attributes
 ; CHECK-NEXT:  .Ltmp9:
 ; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sve-abd.ll b/llvm/test/CodeGen/AArch64/sve-abd.ll
index 1bdff3a42db937..31a226ab74c947 100644
--- a/llvm/test/CodeGen/AArch64/sve-abd.ll
+++ b/llvm/test/CodeGen/AArch64/sve-abd.ll
@@ -251,9 +251,9 @@ define <vscale x 4 x i32> @uabd_non_matching_extension(<vscale x 4 x i32> %a, <v
 ; CHECK-NEXT:    and z1.s, z1.s, #0xff
 ; CHECK-NEXT:    uunpkhi z2.d, z0.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uunpkhi z3.d, z1.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    sub z0.d, z0.d, z1.d
 ; CHECK-NEXT:    sub z1.d, z2.d, z3.d
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d

diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll
index 90eed07c242bf6..209c6198fe575e 100644
--- a/llvm/test/CodeGen/AArch64/sve-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll
@@ -67,33 +67,33 @@ define void @foo(<vscale x 4 x i64> %dst, i1 %cond) {
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:    addvl sp, x29, #-18
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldp x28, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload

diff  --git a/llvm/test/CodeGen/AArch64/sve-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-bitcast.ll
index 5f8fcb3d56e4b2..7dd568fc837a3b 100644
--- a/llvm/test/CodeGen/AArch64/sve-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-bitcast.ll
@@ -16,9 +16,9 @@ define <vscale x 16 x i8> @bitcast_nxv8i16_to_nxv16i8(<vscale x 8 x i16> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -36,9 +36,9 @@ define <vscale x 16 x i8> @bitcast_nxv4i32_to_nxv16i8(<vscale x 4 x i32> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -56,9 +56,9 @@ define <vscale x 16 x i8> @bitcast_nxv2i64_to_nxv16i8(<vscale x 2 x i64> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -76,9 +76,9 @@ define <vscale x 16 x i8> @bitcast_nxv8f16_to_nxv16i8(<vscale x 8 x half> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -96,9 +96,9 @@ define <vscale x 16 x i8> @bitcast_nxv4f32_to_nxv16i8(<vscale x 4 x float> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -116,9 +116,9 @@ define <vscale x 16 x i8> @bitcast_nxv2f64_to_nxv16i8(<vscale x 2 x double> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -136,9 +136,9 @@ define <vscale x 16 x i8> @bitcast_nxv8bf16_to_nxv16i8(<vscale x 8 x bfloat> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -160,9 +160,9 @@ define <vscale x 8 x i16> @bitcast_nxv16i8_to_nxv8i16(<vscale x 16 x i8> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -180,9 +180,9 @@ define <vscale x 8 x i16> @bitcast_nxv4i32_to_nxv8i16(<vscale x 4 x i32> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -200,9 +200,9 @@ define <vscale x 8 x i16> @bitcast_nxv2i64_to_nxv8i16(<vscale x 2 x i64> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -239,9 +239,9 @@ define <vscale x 8 x i16> @bitcast_nxv4f32_to_nxv8i16(<vscale x 4 x float> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -259,9 +259,9 @@ define <vscale x 8 x i16> @bitcast_nxv2f64_to_nxv8i16(<vscale x 2 x double> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -302,9 +302,9 @@ define <vscale x 4 x i32> @bitcast_nxv16i8_to_nxv4i32(<vscale x 16 x i8> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -322,9 +322,9 @@ define <vscale x 4 x i32> @bitcast_nxv8i16_to_nxv4i32(<vscale x 8 x i16> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -342,9 +342,9 @@ define <vscale x 4 x i32> @bitcast_nxv2i64_to_nxv4i32(<vscale x 2 x i64> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -362,9 +362,9 @@ define <vscale x 4 x i32> @bitcast_nxv8f16_to_nxv4i32(<vscale x 8 x half> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -401,9 +401,9 @@ define <vscale x 4 x i32> @bitcast_nxv2f64_to_nxv4i32(<vscale x 2 x double> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -421,9 +421,9 @@ define <vscale x 4 x i32> @bitcast_nxv8bf16_to_nxv4i32(<vscale x 8 x bfloat> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -445,9 +445,9 @@ define <vscale x 2 x i64> @bitcast_nxv16i8_to_nxv2i64(<vscale x 16 x i8> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -465,9 +465,9 @@ define <vscale x 2 x i64> @bitcast_nxv8i16_to_nxv2i64(<vscale x 8 x i16> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -485,9 +485,9 @@ define <vscale x 2 x i64> @bitcast_nxv4i32_to_nxv2i64(<vscale x 4 x i32> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -505,9 +505,9 @@ define <vscale x 2 x i64> @bitcast_nxv8f16_to_nxv2i64(<vscale x 8 x half> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -525,9 +525,9 @@ define <vscale x 2 x i64> @bitcast_nxv4f32_to_nxv2i64(<vscale x 4 x float> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -564,9 +564,9 @@ define <vscale x 2 x i64> @bitcast_nxv8bf16_to_nxv2i64(<vscale x 8 x bfloat> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -588,9 +588,9 @@ define <vscale x 8 x half> @bitcast_nxv16i8_to_nxv8f16(<vscale x 16 x i8> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -627,9 +627,9 @@ define <vscale x 8 x half> @bitcast_nxv4i32_to_nxv8f16(<vscale x 4 x i32> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -647,9 +647,9 @@ define <vscale x 8 x half> @bitcast_nxv2i64_to_nxv8f16(<vscale x 2 x i64> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -667,9 +667,9 @@ define <vscale x 8 x half> @bitcast_nxv4f32_to_nxv8f16(<vscale x 4 x float> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -687,9 +687,9 @@ define <vscale x 8 x half> @bitcast_nxv2f64_to_nxv8f16(<vscale x 2 x double> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -730,9 +730,9 @@ define <vscale x 4 x float> @bitcast_nxv16i8_to_nxv4f32(<vscale x 16 x i8> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -750,9 +750,9 @@ define <vscale x 4 x float> @bitcast_nxv8i16_to_nxv4f32(<vscale x 8 x i16> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -789,9 +789,9 @@ define <vscale x 4 x float> @bitcast_nxv2i64_to_nxv4f32(<vscale x 2 x i64> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -809,9 +809,9 @@ define <vscale x 4 x float> @bitcast_nxv8f16_to_nxv4f32(<vscale x 8 x half> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -829,9 +829,9 @@ define <vscale x 4 x float> @bitcast_nxv2f64_to_nxv4f32(<vscale x 2 x double> %v
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -849,9 +849,9 @@ define <vscale x 4 x float> @bitcast_nxv8bf16_to_nxv4f32(<vscale x 8 x bfloat> %
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -873,9 +873,9 @@ define <vscale x 2 x double> @bitcast_nxv16i8_to_nxv2f64(<vscale x 16 x i8> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -893,9 +893,9 @@ define <vscale x 2 x double> @bitcast_nxv8i16_to_nxv2f64(<vscale x 8 x i16> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -913,9 +913,9 @@ define <vscale x 2 x double> @bitcast_nxv4i32_to_nxv2f64(<vscale x 4 x i32> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -952,9 +952,9 @@ define <vscale x 2 x double> @bitcast_nxv8f16_to_nxv2f64(<vscale x 8 x half> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -972,9 +972,9 @@ define <vscale x 2 x double> @bitcast_nxv4f32_to_nxv2f64(<vscale x 4 x float> %v
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -992,9 +992,9 @@ define <vscale x 2 x double> @bitcast_nxv8bf16_to_nxv2f64(<vscale x 8 x bfloat>
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1016,9 +1016,9 @@ define <vscale x 8 x bfloat> @bitcast_nxv16i8_to_nxv8bf16(<vscale x 16 x i8> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1055,9 +1055,9 @@ define <vscale x 8 x bfloat> @bitcast_nxv4i32_to_nxv8bf16(<vscale x 4 x i32> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1075,9 +1075,9 @@ define <vscale x 8 x bfloat> @bitcast_nxv2i64_to_nxv8bf16(<vscale x 2 x i64> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1114,9 +1114,9 @@ define <vscale x 8 x bfloat> @bitcast_nxv4f32_to_nxv8bf16(<vscale x 4 x float> %
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1134,9 +1134,9 @@ define <vscale x 8 x bfloat> @bitcast_nxv2f64_to_nxv8bf16(<vscale x 2 x double>
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1154,9 +1154,9 @@ define <vscale x 8 x i8> @bitcast_nxv4i16_to_nxv8i8(<vscale x 4 x i16> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1166,9 +1166,9 @@ define <vscale x 8 x i8> @bitcast_nxv4i16_to_nxv8i8(<vscale x 4 x i16> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1182,9 +1182,9 @@ define <vscale x 8 x i8> @bitcast_nxv2i32_to_nxv8i8(<vscale x 2 x i32> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1194,9 +1194,9 @@ define <vscale x 8 x i8> @bitcast_nxv2i32_to_nxv8i8(<vscale x 2 x i32> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1215,9 +1215,9 @@ define <vscale x 8 x i8> @bitcast_nxv1i64_to_nxv8i8(<vscale x 1 x i64> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.h, z0.b
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1232,9 +1232,9 @@ define <vscale x 8 x i8> @bitcast_nxv4f16_to_nxv8i8(<vscale x 4 x half> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1244,9 +1244,9 @@ define <vscale x 8 x i8> @bitcast_nxv4f16_to_nxv8i8(<vscale x 4 x half> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1260,9 +1260,9 @@ define <vscale x 8 x i8> @bitcast_nxv2f32_to_nxv8i8(<vscale x 2 x float> %v) #0
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1272,9 +1272,9 @@ define <vscale x 8 x i8> @bitcast_nxv2f32_to_nxv8i8(<vscale x 2 x float> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1293,9 +1293,9 @@ define <vscale x 8 x i8> @bitcast_nxv1f64_to_nxv8i8(<vscale x 1 x double> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.h, z0.b
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1310,9 +1310,9 @@ define <vscale x 8 x i8> @bitcast_nxv4bf16_to_nxv8i8(<vscale x 4 x bfloat> %v) #
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1322,9 +1322,9 @@ define <vscale x 8 x i8> @bitcast_nxv4bf16_to_nxv8i8(<vscale x 4 x bfloat> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1b { z0.h }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1b { z0.h }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1342,9 +1342,9 @@ define <vscale x 4 x i16> @bitcast_nxv8i8_to_nxv4i16(<vscale x 8 x i8> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1354,9 +1354,9 @@ define <vscale x 4 x i16> @bitcast_nxv8i8_to_nxv4i16(<vscale x 8 x i8> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1370,9 +1370,9 @@ define <vscale x 4 x i16> @bitcast_nxv2i32_to_nxv4i16(<vscale x 2 x i32> %v) #0
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1382,9 +1382,9 @@ define <vscale x 4 x i16> @bitcast_nxv2i32_to_nxv4i16(<vscale x 2 x i32> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1403,9 +1403,9 @@ define <vscale x 4 x i16> @bitcast_nxv1i64_to_nxv4i16(<vscale x 1 x i64> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1424,9 +1424,9 @@ define <vscale x 4 x i16> @bitcast_nxv4f16_to_nxv4i16(<vscale x 4 x half> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1440,9 +1440,9 @@ define <vscale x 4 x i16> @bitcast_nxv2f32_to_nxv4i16(<vscale x 2 x float> %v) #
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1452,9 +1452,9 @@ define <vscale x 4 x i16> @bitcast_nxv2f32_to_nxv4i16(<vscale x 2 x float> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1473,9 +1473,9 @@ define <vscale x 4 x i16> @bitcast_nxv1f64_to_nxv4i16(<vscale x 1 x double> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1494,9 +1494,9 @@ define <vscale x 4 x i16> @bitcast_nxv4bf16_to_nxv4i16(<vscale x 4 x bfloat> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1514,9 +1514,9 @@ define <vscale x 2 x i32> @bitcast_nxv8i8_to_nxv2i32(<vscale x 8 x i8> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1526,9 +1526,9 @@ define <vscale x 2 x i32> @bitcast_nxv8i8_to_nxv2i32(<vscale x 8 x i8> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1542,9 +1542,9 @@ define <vscale x 2 x i32> @bitcast_nxv4i16_to_nxv2i32(<vscale x 4 x i16> %v) #0
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1554,9 +1554,9 @@ define <vscale x 2 x i32> @bitcast_nxv4i16_to_nxv2i32(<vscale x 4 x i16> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1575,9 +1575,9 @@ define <vscale x 2 x i32> @bitcast_nxv1i64_to_nxv2i32(<vscale x 1 x i64> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1592,9 +1592,9 @@ define <vscale x 2 x i32> @bitcast_nxv4f16_to_nxv2i32(<vscale x 4 x half> %v) #0
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1604,9 +1604,9 @@ define <vscale x 2 x i32> @bitcast_nxv4f16_to_nxv2i32(<vscale x 4 x half> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1624,9 +1624,9 @@ define <vscale x 2 x i32> @bitcast_nxv2f32_to_nxv2i32(<vscale x 2 x float> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1645,9 +1645,9 @@ define <vscale x 2 x i32> @bitcast_nxv1f64_to_nxv2i32(<vscale x 1 x double> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1662,9 +1662,9 @@ define <vscale x 2 x i32> @bitcast_nxv4bf16_to_nxv2i32(<vscale x 4 x bfloat> %v)
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1674,9 +1674,9 @@ define <vscale x 2 x i32> @bitcast_nxv4bf16_to_nxv2i32(<vscale x 4 x bfloat> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1700,9 +1700,9 @@ define <vscale x 1 x i64> @bitcast_nxv8i8_to_nxv1i64(<vscale x 8 x i8> %v) #0 {
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.b
 ; CHECK_BE-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1722,9 +1722,9 @@ define <vscale x 1 x i64> @bitcast_nxv4i16_to_nxv1i64(<vscale x 4 x i16> %v) #0
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
 ; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1744,9 +1744,9 @@ define <vscale x 1 x i64> @bitcast_nxv2i32_to_nxv1i64(<vscale x 2 x i32> %v) #0
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
 ; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1768,12 +1768,12 @@ define <vscale x 1 x i64> @bitcast_nxv4f16_to_nxv1i64(<vscale x 4 x half> %v) #0
 ; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #2, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp, #2, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #3
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1840,12 +1840,12 @@ define <vscale x 1 x i64> @bitcast_nxv4bf16_to_nxv1i64(<vscale x 4 x bfloat> %v)
 ; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #2, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp, #2, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #3
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1863,9 +1863,9 @@ define <vscale x 4 x half> @bitcast_nxv8i8_to_nxv4f16(<vscale x 8 x i8> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1875,9 +1875,9 @@ define <vscale x 4 x half> @bitcast_nxv8i8_to_nxv4f16(<vscale x 8 x i8> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1895,9 +1895,9 @@ define <vscale x 4 x half> @bitcast_nxv4i16_to_nxv4f16(<vscale x 4 x i16> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1911,9 +1911,9 @@ define <vscale x 4 x half> @bitcast_nxv2i32_to_nxv4f16(<vscale x 2 x i32> %v) #0
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1923,9 +1923,9 @@ define <vscale x 4 x half> @bitcast_nxv2i32_to_nxv4f16(<vscale x 2 x i32> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1944,9 +1944,9 @@ define <vscale x 4 x half> @bitcast_nxv1i64_to_nxv4f16(<vscale x 1 x i64> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -1961,9 +1961,9 @@ define <vscale x 4 x half> @bitcast_nxv2f32_to_nxv4f16(<vscale x 2 x float> %v)
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1973,9 +1973,9 @@ define <vscale x 4 x half> @bitcast_nxv2f32_to_nxv4f16(<vscale x 2 x float> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -1994,9 +1994,9 @@ define <vscale x 4 x half> @bitcast_nxv1f64_to_nxv4f16(<vscale x 1 x double> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2034,9 +2034,9 @@ define <vscale x 2 x float> @bitcast_nxv8i8_to_nxv2f32(<vscale x 8 x i8> %v) #0
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2046,9 +2046,9 @@ define <vscale x 2 x float> @bitcast_nxv8i8_to_nxv2f32(<vscale x 8 x i8> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2062,9 +2062,9 @@ define <vscale x 2 x float> @bitcast_nxv4i16_to_nxv2f32(<vscale x 4 x i16> %v) #
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2074,9 +2074,9 @@ define <vscale x 2 x float> @bitcast_nxv4i16_to_nxv2f32(<vscale x 4 x i16> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2094,9 +2094,9 @@ define <vscale x 2 x float> @bitcast_nxv2i32_to_nxv2f32(<vscale x 2 x i32> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2115,9 +2115,9 @@ define <vscale x 2 x float> @bitcast_nxv1i64_to_nxv2f32(<vscale x 1 x i64> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2132,9 +2132,9 @@ define <vscale x 2 x float> @bitcast_nxv4f16_to_nxv2f32(<vscale x 4 x half> %v)
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2144,9 +2144,9 @@ define <vscale x 2 x float> @bitcast_nxv4f16_to_nxv2f32(<vscale x 4 x half> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2165,9 +2165,9 @@ define <vscale x 2 x float> @bitcast_nxv1f64_to_nxv2f32(<vscale x 1 x double> %v
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2182,9 +2182,9 @@ define <vscale x 2 x float> @bitcast_nxv4bf16_to_nxv2f32(<vscale x 4 x bfloat> %
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2194,9 +2194,9 @@ define <vscale x 2 x float> @bitcast_nxv4bf16_to_nxv2f32(<vscale x 4 x bfloat> %
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2220,9 +2220,9 @@ define <vscale x 1 x double> @bitcast_nxv8i8_to_nxv1f64(<vscale x 8 x i8> %v) #0
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.b
 ; CHECK_BE-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2242,9 +2242,9 @@ define <vscale x 1 x double> @bitcast_nxv4i16_to_nxv1f64(<vscale x 4 x i16> %v)
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
 ; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2264,9 +2264,9 @@ define <vscale x 1 x double> @bitcast_nxv2i32_to_nxv1f64(<vscale x 2 x i32> %v)
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
 ; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2307,12 +2307,12 @@ define <vscale x 1 x double> @bitcast_nxv4f16_to_nxv1f64(<vscale x 4 x half> %v)
 ; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #2, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp, #2, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #3
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2360,12 +2360,12 @@ define <vscale x 1 x double> @bitcast_nxv4bf16_to_nxv1f64(<vscale x 4 x bfloat>
 ; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #2, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp, #2, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #3
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2383,9 +2383,9 @@ define <vscale x 4 x bfloat> @bitcast_nxv8i8_to_nxv4bf16(<vscale x 8 x i8> %v) #
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2395,9 +2395,9 @@ define <vscale x 4 x bfloat> @bitcast_nxv8i8_to_nxv4bf16(<vscale x 8 x i8> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1b { z0.h }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2415,9 +2415,9 @@ define <vscale x 4 x bfloat> @bitcast_nxv4i16_to_nxv4bf16(<vscale x 4 x i16> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2431,9 +2431,9 @@ define <vscale x 4 x bfloat> @bitcast_nxv2i32_to_nxv4bf16(<vscale x 2 x i32> %v)
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2443,9 +2443,9 @@ define <vscale x 4 x bfloat> @bitcast_nxv2i32_to_nxv4bf16(<vscale x 2 x i32> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2464,9 +2464,9 @@ define <vscale x 4 x bfloat> @bitcast_nxv1i64_to_nxv4bf16(<vscale x 1 x i64> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2500,9 +2500,9 @@ define <vscale x 4 x bfloat> @bitcast_nxv2f32_to_nxv4bf16(<vscale x 2 x float> %
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2512,9 +2512,9 @@ define <vscale x 4 x bfloat> @bitcast_nxv2f32_to_nxv4bf16(<vscale x 2 x float> %
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.s }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2533,9 +2533,9 @@ define <vscale x 4 x bfloat> @bitcast_nxv1f64_to_nxv4bf16(<vscale x 1 x double>
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -2554,9 +2554,9 @@ define <vscale x 4 x i8> @bitcast_nxv2i16_to_nxv4i8(<vscale x 2 x i16> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1b { z0.s }, p0/z, [sp, #3, mul vl]
+; CHECK-NEXT:    ld1b { z0.s }, p1/z, [sp, #3, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2566,9 +2566,9 @@ define <vscale x 4 x i8> @bitcast_nxv2i16_to_nxv4i8(<vscale x 2 x i16> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1b { z0.s }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ld1b { z0.s }, p1/z, [sp, #3, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2588,9 +2588,9 @@ define <vscale x 4 x i8> @bitcast_nxv1i32_to_nxv4i8(<vscale x 1 x i32> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.h, z0.b
 ; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
@@ -2606,9 +2606,9 @@ define <vscale x 4 x i8> @bitcast_nxv2f16_to_nxv4i8(<vscale x 2 x half> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1b { z0.s }, p0/z, [sp, #3, mul vl]
+; CHECK-NEXT:    ld1b { z0.s }, p1/z, [sp, #3, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2618,9 +2618,9 @@ define <vscale x 4 x i8> @bitcast_nxv2f16_to_nxv4i8(<vscale x 2 x half> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1b { z0.s }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ld1b { z0.s }, p1/z, [sp, #3, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2636,9 +2636,9 @@ define <vscale x 4 x i8> @bitcast_nxv2bf16_to_nxv4i8(<vscale x 2 x bfloat> %v) #
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1b { z0.s }, p0/z, [sp, #3, mul vl]
+; CHECK-NEXT:    ld1b { z0.s }, p1/z, [sp, #3, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2648,9 +2648,9 @@ define <vscale x 4 x i8> @bitcast_nxv2bf16_to_nxv4i8(<vscale x 2 x bfloat> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp, #3, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1b { z0.s }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ld1b { z0.s }, p1/z, [sp, #3, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2668,9 +2668,9 @@ define <vscale x 2 x i16> @bitcast_nxv4i8_to_nxv2i16(<vscale x 4 x i8> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1b { z0.s }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK-NEXT:    ld1h { z0.d }, p1/z, [sp, #3, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2680,9 +2680,9 @@ define <vscale x 2 x i16> @bitcast_nxv4i8_to_nxv2i16(<vscale x 4 x i8> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1b { z0.s }, p0, [sp, #3, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.d }, p1/z, [sp, #3, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2702,9 +2702,9 @@ define <vscale x 2 x i16> @bitcast_nxv1i32_to_nxv2i16(<vscale x 1 x i32> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
@@ -2724,9 +2724,9 @@ define <vscale x 2 x i16> @bitcast_nxv2f16_to_nxv2i16(<vscale x 2 x half> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2746,9 +2746,9 @@ define <vscale x 2 x i16> @bitcast_nxv2bf16_to_nxv2i16(<vscale x 2 x bfloat> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1d { z0.d }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1d { z0.d }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2773,10 +2773,10 @@ define <vscale x 1 x i32> @bitcast_nxv4i8_to_nxv1i32(<vscale x 4 x i8> %v) #0 {
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2797,10 +2797,10 @@ define <vscale x 1 x i32> @bitcast_nxv2i16_to_nxv1i32(<vscale x 2 x i16> %v) #0
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.s
 ; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2814,9 +2814,9 @@ define <vscale x 1 x i32> @bitcast_nxv2f16_to_nxv1i32(<vscale x 2 x half> %v) #0
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1h { z0.d }, p0, [sp]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2826,12 +2826,12 @@ define <vscale x 1 x i32> @bitcast_nxv2f16_to_nxv1i32(<vscale x 2 x half> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-2
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
+; CHECK_BE-NEXT:    ptrue p2.s
 ; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
+; CHECK_BE-NEXT:    st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p2/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #2
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2847,9 +2847,9 @@ define <vscale x 1 x i32> @bitcast_nxv2bf16_to_nxv1i32(<vscale x 2 x bfloat> %v)
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1h { z0.d }, p0, [sp]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2859,12 +2859,12 @@ define <vscale x 1 x i32> @bitcast_nxv2bf16_to_nxv1i32(<vscale x 2 x bfloat> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-2
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
+; CHECK_BE-NEXT:    ptrue p2.s
 ; CHECK_BE-NEXT:    st1h { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
-; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.s
-; CHECK_BE-NEXT:    ld1w { z0.s }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
+; CHECK_BE-NEXT:    st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.s }, p2/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #2
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2882,9 +2882,9 @@ define <vscale x 2 x half> @bitcast_nxv4i8_to_nxv2f16(<vscale x 4 x i8> %v) #0 {
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1b { z0.s }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK-NEXT:    ld1h { z0.d }, p1/z, [sp, #3, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2894,9 +2894,9 @@ define <vscale x 2 x half> @bitcast_nxv4i8_to_nxv2f16(<vscale x 4 x i8> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1b { z0.s }, p0, [sp, #3, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.d }, p1/z, [sp, #3, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2914,9 +2914,9 @@ define <vscale x 2 x half> @bitcast_nxv2i16_to_nxv2f16(<vscale x 2 x i16> %v) #0
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2966,9 +2966,9 @@ define <vscale x 2 x bfloat> @bitcast_nxv4i8_to_nxv2bf16(<vscale x 4 x i8> %v) #
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    st1b { z0.s }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK-NEXT:    ld1h { z0.d }, p1/z, [sp, #3, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -2978,9 +2978,9 @@ define <vscale x 2 x bfloat> @bitcast_nxv4i8_to_nxv2bf16(<vscale x 4 x i8> %v) #
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    st1b { z0.s }, p0, [sp, #3, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1h { z0.d }, p0/z, [sp, #3, mul vl]
+; CHECK_BE-NEXT:    ld1h { z0.d }, p1/z, [sp, #3, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -2998,9 +2998,9 @@ define <vscale x 2 x bfloat> @bitcast_nxv2i16_to_nxv2bf16(<vscale x 2 x i16> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.d
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -3048,9 +3048,9 @@ define <vscale x 2 x i8> @bitcast_nxv1i16_to_nxv2i8(<vscale x 1 x i16> %v) #0 {
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.h
+; CHECK_BE-NEXT:    ptrue p1.b
 ; CHECK_BE-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.b
-; CHECK_BE-NEXT:    ld1b { z0.b }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1b { z0.b }, p1/z, [sp]
 ; CHECK_BE-NEXT:    uunpklo z0.h, z0.b
 ; CHECK_BE-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_BE-NEXT:    uunpklo z0.d, z0.s
@@ -3082,11 +3082,11 @@ define <vscale x 1 x i16> @bitcast_nxv2i8_to_nxv1i16(<vscale x 2 x i8> %v) #0 {
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK_BE-NEXT:    ptrue p0.b
+; CHECK_BE-NEXT:    ptrue p1.h
 ; CHECK_BE-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK_BE-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK_BE-NEXT:    st1b { z0.b }, p0, [sp]
-; CHECK_BE-NEXT:    ptrue p0.h
-; CHECK_BE-NEXT:    ld1h { z0.h }, p0/z, [sp]
+; CHECK_BE-NEXT:    ld1h { z0.h }, p1/z, [sp]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret
@@ -3172,10 +3172,10 @@ define <vscale x 2 x float> @bitcast_short_half_to_float(<vscale x 4 x half> %v)
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z0.h
 ; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -3185,10 +3185,10 @@ define <vscale x 2 x float> @bitcast_short_half_to_float(<vscale x 4 x half> %v)
 ; CHECK_BE-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK_BE-NEXT:    addvl sp, sp, #-1
 ; CHECK_BE-NEXT:    ptrue p0.s
+; CHECK_BE-NEXT:    ptrue p1.d
 ; CHECK_BE-NEXT:    fadd z0.h, p0/m, z0.h, z0.h
 ; CHECK_BE-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
-; CHECK_BE-NEXT:    ptrue p0.d
-; CHECK_BE-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK_BE-NEXT:    ld1w { z0.d }, p1/z, [sp, #1, mul vl]
 ; CHECK_BE-NEXT:    addvl sp, sp, #1
 ; CHECK_BE-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK_BE-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-breakdown-scalable-vectortype.ll b/llvm/test/CodeGen/AArch64/sve-breakdown-scalable-vectortype.ll
index 63184513a769e9..5b69b68552a4d7 100644
--- a/llvm/test/CodeGen/AArch64/sve-breakdown-scalable-vectortype.ll
+++ b/llvm/test/CodeGen/AArch64/sve-breakdown-scalable-vectortype.ll
@@ -16,9 +16,9 @@ define <vscale x 32 x i8> @wide_32i8(i1 %b, <vscale x 16 x i8> %legal, <vscale x
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    str z9, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z2.d
-; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB0_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
@@ -44,9 +44,9 @@ define <vscale x 16 x i16> @wide_16i16(i1 %b, <vscale x 16 x i8> %legal, <vscale
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    str z9, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z2.d
-; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB1_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
@@ -72,9 +72,9 @@ define <vscale x 8 x i32> @wide_8i32(i1 %b, <vscale x 16 x i8> %legal, <vscale x
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    str z9, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z2.d
-; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB2_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
@@ -100,9 +100,9 @@ define <vscale x 4 x i64> @wide_4i64(i1 %b, <vscale x 16 x i8> %legal, <vscale x
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    str z9, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z2.d
-; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB3_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
@@ -128,9 +128,9 @@ define <vscale x 16 x half> @wide_16f16(i1 %b, <vscale x 16 x i8> %legal, <vscal
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    str z9, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z2.d
-; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB4_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
@@ -156,9 +156,9 @@ define <vscale x 8 x float> @wide_8f32(i1 %b, <vscale x 16 x i8> %legal, <vscale
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    str z9, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z2.d
-; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB5_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
@@ -184,9 +184,9 @@ define <vscale x 4 x double> @wide_4f64(i1 %b, <vscale x 16 x i8> %legal, <vscal
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    str z9, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z2.d
-; CHECK-NEXT:    mov z9.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB6_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
@@ -216,19 +216,19 @@ define <vscale x 48 x i8> @wide_48i8(i1 %b, <vscale x 16 x i8> %legal, <vscale x
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z2.d
 ; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z3.d
-; CHECK-NEXT:    mov z9.d, z2.d
-; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB7_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB7_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z10.d
 ; CHECK-NEXT:    mov z1.d, z9.d
-; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #3
@@ -248,19 +248,19 @@ define <vscale x 24 x i16> @wide_24i16(i1 %b, <vscale x 16 x i8> %legal, <vscale
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z2.d
 ; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z3.d
-; CHECK-NEXT:    mov z9.d, z2.d
-; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB8_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB8_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z10.d
 ; CHECK-NEXT:    mov z1.d, z9.d
-; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #3
@@ -280,19 +280,19 @@ define <vscale x 12 x i32> @wide_12i32(i1 %b, <vscale x 16 x i8> %legal, <vscale
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z2.d
 ; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z3.d
-; CHECK-NEXT:    mov z9.d, z2.d
-; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB9_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB9_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z10.d
 ; CHECK-NEXT:    mov z1.d, z9.d
-; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #3
@@ -312,19 +312,19 @@ define <vscale x 6 x i64> @wide_6i64(i1 %b, <vscale x 16 x i8> %legal, <vscale x
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z2.d
 ; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z3.d
-; CHECK-NEXT:    mov z9.d, z2.d
-; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB10_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB10_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z10.d
 ; CHECK-NEXT:    mov z1.d, z9.d
-; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #3
@@ -344,19 +344,19 @@ define <vscale x 24 x half> @wide_24f16(i1 %b, <vscale x 16 x i8> %legal, <vscal
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z2.d
 ; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z3.d
-; CHECK-NEXT:    mov z9.d, z2.d
-; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB11_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB11_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z10.d
 ; CHECK-NEXT:    mov z1.d, z9.d
-; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #3
@@ -376,19 +376,19 @@ define <vscale x 12 x float> @wide_12f32(i1 %b, <vscale x 16 x i8> %legal, <vsca
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z2.d
 ; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z3.d
-; CHECK-NEXT:    mov z9.d, z2.d
-; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB12_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB12_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z10.d
 ; CHECK-NEXT:    mov z1.d, z9.d
-; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #3
@@ -408,19 +408,19 @@ define <vscale x 6 x double> @wide_6f64(i1 %b, <vscale x 16 x i8> %legal, <vscal
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z2.d
 ; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z3.d
-; CHECK-NEXT:    mov z9.d, z2.d
-; CHECK-NEXT:    mov z10.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB13_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB13_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z10.d
 ; CHECK-NEXT:    mov z1.d, z9.d
-; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, z8.d
 ; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #3
@@ -444,22 +444,22 @@ define <vscale x 64 x i8> @wide_64i8(i1 %b, <vscale x 16 x i8> %legal, <vscale x
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z2.d
 ; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z3.d
 ; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z4.d
-; CHECK-NEXT:    mov z9.d, z3.d
-; CHECK-NEXT:    mov z10.d, z2.d
-; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB14_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB14_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z11.d
 ; CHECK-NEXT:    mov z1.d, z10.d
+; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, z9.d
 ; CHECK-NEXT:    mov z3.d, z8.d
-; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -480,22 +480,22 @@ define <vscale x 32 x i16> @wide_32i16(i1 %b, <vscale x 16 x i8> %legal, <vscale
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z2.d
 ; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z3.d
 ; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z4.d
-; CHECK-NEXT:    mov z9.d, z3.d
-; CHECK-NEXT:    mov z10.d, z2.d
-; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB15_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB15_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z11.d
 ; CHECK-NEXT:    mov z1.d, z10.d
+; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, z9.d
 ; CHECK-NEXT:    mov z3.d, z8.d
-; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -516,22 +516,22 @@ define <vscale x 16 x i32> @wide_16i32(i1 %b, <vscale x 16 x i8> %legal, <vscale
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z2.d
 ; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z3.d
 ; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z4.d
-; CHECK-NEXT:    mov z9.d, z3.d
-; CHECK-NEXT:    mov z10.d, z2.d
-; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB16_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB16_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z11.d
 ; CHECK-NEXT:    mov z1.d, z10.d
+; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, z9.d
 ; CHECK-NEXT:    mov z3.d, z8.d
-; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -552,22 +552,22 @@ define <vscale x 8 x i64> @wide_8i64(i1 %b, <vscale x 16 x i8> %legal, <vscale x
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z2.d
 ; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z3.d
 ; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z4.d
-; CHECK-NEXT:    mov z9.d, z3.d
-; CHECK-NEXT:    mov z10.d, z2.d
-; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB17_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB17_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z11.d
 ; CHECK-NEXT:    mov z1.d, z10.d
+; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, z9.d
 ; CHECK-NEXT:    mov z3.d, z8.d
-; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -588,22 +588,22 @@ define <vscale x 32 x half> @wide_32f16(i1 %b, <vscale x 16 x i8> %legal, <vscal
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z2.d
 ; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z3.d
 ; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z4.d
-; CHECK-NEXT:    mov z9.d, z3.d
-; CHECK-NEXT:    mov z10.d, z2.d
-; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB18_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB18_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z11.d
 ; CHECK-NEXT:    mov z1.d, z10.d
+; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, z9.d
 ; CHECK-NEXT:    mov z3.d, z8.d
-; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -624,22 +624,22 @@ define <vscale x 16 x float> @wide_16f32(i1 %b, <vscale x 16 x i8> %legal, <vsca
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z2.d
 ; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z3.d
 ; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z4.d
-; CHECK-NEXT:    mov z9.d, z3.d
-; CHECK-NEXT:    mov z10.d, z2.d
-; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB19_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB19_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z11.d
 ; CHECK-NEXT:    mov z1.d, z10.d
+; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, z9.d
 ; CHECK-NEXT:    mov z3.d, z8.d
-; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -660,22 +660,22 @@ define <vscale x 8 x double> @wide_8f64(i1 %b, <vscale x 16 x i8> %legal, <vscal
 ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z10.d, z2.d
 ; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    mov z9.d, z3.d
 ; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov z8.d, z4.d
-; CHECK-NEXT:    mov z9.d, z3.d
-; CHECK-NEXT:    mov z10.d, z2.d
-; CHECK-NEXT:    mov z11.d, z1.d
 ; CHECK-NEXT:    tbz w0, #0, .LBB20_2
 ; CHECK-NEXT:  // %bb.1: // %L1
 ; CHECK-NEXT:    bl bar
 ; CHECK-NEXT:  .LBB20_2: // %common.ret
 ; CHECK-NEXT:    mov z0.d, z11.d
 ; CHECK-NEXT:    mov z1.d, z10.d
+; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    mov z2.d, z9.d
 ; CHECK-NEXT:    mov z3.d, z8.d
-; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 251e06bad004ba..53d7509c79dce9 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -14,9 +14,9 @@ define float @foo1(ptr %x0, ptr %x1, ptr %x2) nounwind {
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    fmov s0, #1.00000000
 ; CHECK-NEXT:    ld4d { z1.d - z4.d }, p0/z, [x0]
+; CHECK-NEXT:    mov x0, sp
 ; CHECK-NEXT:    ld4d { z16.d - z19.d }, p0/z, [x1]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x2]
-; CHECK-NEXT:    mov x0, sp
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
 ; CHECK-NEXT:    st1d { z17.d }, p0, [sp, #1, mul vl]
@@ -59,23 +59,23 @@ define float @foo2(ptr %x0, ptr %x1) nounwind {
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    fmov s0, #1.00000000
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    add x9, sp, #16
+; CHECK-NEXT:    mov w2, #2 // =0x2
+; CHECK-NEXT:    mov w3, #3 // =0x3
+; CHECK-NEXT:    mov w4, #4 // =0x4
+; CHECK-NEXT:    mov w5, #5 // =0x5
+; CHECK-NEXT:    mov w6, #6 // =0x6
+; CHECK-NEXT:    mov w7, #7 // =0x7
 ; CHECK-NEXT:    ld4d { z1.d - z4.d }, p0/z, [x0]
-; CHECK-NEXT:    ld4d { z16.d - z19.d }, p0/z, [x1]
-; CHECK-NEXT:    fmov s0, #1.00000000
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    mov w1, #1
-; CHECK-NEXT:    mov w2, #2
-; CHECK-NEXT:    mov w3, #3
-; CHECK-NEXT:    mov w4, #4
-; CHECK-NEXT:    mov w5, #5
-; CHECK-NEXT:    mov w6, #6
-; CHECK-NEXT:    mov w7, #7
-; CHECK-NEXT:    add x9, sp, #16
+; CHECK-NEXT:    ld4d { z16.d - z19.d }, p0/z, [x1]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z16.d }, p0, [x9]
-; CHECK-NEXT:    st1d { z17.d }, p0, [x9, #1, mul vl]
-; CHECK-NEXT:    st1d { z18.d }, p0, [x9, #2, mul vl]
+; CHECK-NEXT:    mov w1, #1 // =0x1
+; CHECK-NEXT:    st1d { z16.d }, p0, [x8]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
 ; CHECK-NEXT:    st1d { z19.d }, p0, [x9, #3, mul vl]
 ; CHECK-NEXT:    str x8, [sp]
 ; CHECK-NEXT:    bl callee2
@@ -115,11 +115,11 @@ define float @foo3(ptr %x0, ptr %x1, ptr %x2) nounwind {
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    fmov s1, #2.00000000
 ; CHECK-NEXT:    ld4d { z2.d - z5.d }, p0/z, [x0]
+; CHECK-NEXT:    mov x0, sp
 ; CHECK-NEXT:    ld3d { z16.d - z18.d }, p0/z, [x1]
 ; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x2]
-; CHECK-NEXT:    fmov s1, #2.00000000
-; CHECK-NEXT:    mov x0, sp
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
 ; CHECK-NEXT:    st1d { z17.d }, p0, [sp, #1, mul vl]
@@ -182,8 +182,8 @@ entry:
 define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, ptr %ptr1, ptr %ptr2, double %x0, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2) nounwind {
 ; CHECK-LABEL: foo5:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x8]
 ; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x8, #3, mul vl]
@@ -229,8 +229,8 @@ entry:
 define void @aavpcs1(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, <vscale x 4 x i32> %s7, <vscale x 4 x i32> %s8, <vscale x 4 x i32> %s9, <vscale x 4 x i32> %s10, <vscale x 4 x i32> %s11, <vscale x 4 x i32> %s12, <vscale x 4 x i32> %s13, <vscale x 4 x i32> %s14, <vscale x 4 x i32> %s15, <vscale x 4 x i32> %s16, ptr %ptr) nounwind {
 ; CHECK-LABEL: aavpcs1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp x8, x9, [sp]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldp x8, x9, [sp]
 ; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x8]
 ; CHECK-NEXT:    ld1w { z24.s }, p0/z, [x7]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x9]
@@ -261,8 +261,8 @@ entry:
 define void @aavpcs2(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, <vscale x 4 x float> %s7, <vscale x 4 x float> %s8, <vscale x 4 x float> %s9, <vscale x 4 x float> %s10, <vscale x 4 x float> %s11, <vscale x 4 x float> %s12,<vscale x 4 x float> %s13,<vscale x 4 x float> %s14,<vscale x 4 x float> %s15,<vscale x 4 x float> %s16,ptr %ptr) nounwind {
 ; CHECK-LABEL: aavpcs2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp x8, x9, [sp]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldp x8, x9, [sp]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x7]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x6]
@@ -299,8 +299,8 @@ entry:
 define void @aavpcs3(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, <vscale x 4 x float> %s8, <vscale x 4 x float> %s9, <vscale x 4 x float> %s10, <vscale x 4 x float> %s11, <vscale x 4 x float> %s12, <vscale x 4 x float> %s13, <vscale x 4 x float> %s14, <vscale x 4 x float> %s15, <vscale x 4 x float> %s16, <vscale x 4 x float> %s17, <vscale x 16 x i1> %p0, ptr %ptr) nounwind {
 ; CHECK-LABEL: aavpcs3:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x7]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x6]
@@ -339,8 +339,8 @@ entry:
 define void @aavpcs4(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, i32 %s7, <vscale x 4 x i32> %s8, <vscale x 4 x i32> %s9, <vscale x 4 x i32> %s10, <vscale x 4 x i32> %s11, <vscale x 4 x i32> %s12, <vscale x 4 x i32> %s13, <vscale x 4 x i32> %s14, <vscale x 4 x i32> %s15, <vscale x 4 x i32> %s16, <vscale x 4 x i32> %s17, ptr %ptr) nounwind {
 ; CHECK-LABEL: aavpcs4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ldr x9, [sp, #16]
 ; CHECK-NEXT:    ld1w { z24.s }, p0/z, [x8]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x9]
@@ -371,8 +371,8 @@ entry:
 define <vscale x 4 x float> @aavpcs5(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, <vscale x 4 x float> %s8, <vscale x 4 x float> %s9, <vscale x 4 x float> %s10, <vscale x 4 x float> %s11, <vscale x 4 x float> %s12, <vscale x 4 x float> %s13, <vscale x 4 x float> %s14, <vscale x 4 x float> %s15, <vscale x 4 x float> %s16, <vscale x 4 x float> %s17, ptr %ptr) nounwind {
 ; CHECK-LABEL: aavpcs5:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x7]
 ; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x6]
@@ -409,8 +409,8 @@ entry:
 define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, <vscale x 4 x float> %s8, <vscale x 4 x float> %s9, <vscale x 4 x float> %s10, <vscale x 4 x float> %s11, <vscale x 4 x float> %s12, <vscale x 4 x float> %s13, <vscale x 4 x float> %s14, <vscale x 4 x float> %s15, <vscale x 4 x float> %s16, <vscale x 4 x float> %s17, ptr %ptr) nounwind {
 ; CHECK-LABEL: aapcs1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldr x8, [sp]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x7]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x6]
@@ -456,14 +456,14 @@ define void @non_sve_caller_non_sve_callee_high_range()  {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    fmov s1, #1.00000000
+; CHECK-NEXT:    addvl x0, sp, #1
 ; CHECK-NEXT:    fmov s2, #2.00000000
 ; CHECK-NEXT:    fmov s3, #3.00000000
+; CHECK-NEXT:    mov x1, sp
 ; CHECK-NEXT:    fmov s4, #4.00000000
 ; CHECK-NEXT:    fmov s5, #5.00000000
 ; CHECK-NEXT:    fmov s6, #6.00000000
 ; CHECK-NEXT:    fmov s7, #7.00000000
-; CHECK-NEXT:    mov x1, sp
-; CHECK-NEXT:    addvl x0, sp, #1
 ; CHECK-NEXT:    bl non_sve_callee_high_range
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
@@ -482,8 +482,6 @@ define void @non_sve_caller_high_range_non_sve_callee_high_range(float %f0, floa
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    movi d0, #0000000000000000
-; CHECK-NEXT:    ld1w { z16.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z17.s }, p0/z, [x1]
 ; CHECK-NEXT:    fmov s1, #1.00000000
 ; CHECK-NEXT:    fmov s2, #2.00000000
 ; CHECK-NEXT:    fmov s3, #3.00000000
@@ -491,8 +489,10 @@ define void @non_sve_caller_high_range_non_sve_callee_high_range(float %f0, floa
 ; CHECK-NEXT:    fmov s5, #5.00000000
 ; CHECK-NEXT:    fmov s6, #6.00000000
 ; CHECK-NEXT:    fmov s7, #7.00000000
-; CHECK-NEXT:    mov x1, sp
+; CHECK-NEXT:    ld1w { z16.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z17.s }, p0/z, [x1]
 ; CHECK-NEXT:    addvl x0, sp, #1
+; CHECK-NEXT:    mov x1, sp
 ; CHECK-NEXT:    st1w { z17.s }, p0, [sp]
 ; CHECK-NEXT:    st1w { z16.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    bl non_sve_callee_high_range
@@ -548,53 +548,53 @@ define <vscale x 4 x float> @sve_caller_non_sve_callee_high_range(<vscale x 4 x
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z25.d, z0.d
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    mov z24.d, z1.d
+; CHECK-NEXT:    addvl x0, sp, #2
 ; CHECK-NEXT:    fmov s1, #1.00000000
 ; CHECK-NEXT:    fmov s2, #2.00000000
+; CHECK-NEXT:    addvl x1, sp, #1
 ; CHECK-NEXT:    fmov s3, #3.00000000
 ; CHECK-NEXT:    fmov s4, #4.00000000
 ; CHECK-NEXT:    fmov s5, #5.00000000
 ; CHECK-NEXT:    fmov s6, #6.00000000
 ; CHECK-NEXT:    fmov s7, #7.00000000
-; CHECK-NEXT:    addvl x0, sp, #2
-; CHECK-NEXT:    addvl x1, sp, #1
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    st1w { z24.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z25.s }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    bl non_sve_callee_high_range
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -649,44 +649,44 @@ define <vscale x 4 x float> @sve_ret_caller_non_sve_callee_high_range()  {
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    fmov s1, #1.00000000
+; CHECK-NEXT:    addvl x0, sp, #1
 ; CHECK-NEXT:    fmov s2, #2.00000000
 ; CHECK-NEXT:    fmov s3, #3.00000000
+; CHECK-NEXT:    mov x1, sp
 ; CHECK-NEXT:    fmov s4, #4.00000000
 ; CHECK-NEXT:    fmov s5, #5.00000000
 ; CHECK-NEXT:    fmov s6, #6.00000000
 ; CHECK-NEXT:    fmov s7, #7.00000000
-; CHECK-NEXT:    mov x1, sp
-; CHECK-NEXT:    addvl x0, sp, #1
 ; CHECK-NEXT:    bl non_sve_callee_high_range
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
index 22192cb0665047..f5721cd0fd7936 100644
--- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -58,9 +58,9 @@ define float @fminimum_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    fminv s2, p0, z2.s
+; CHECK-NEXT:    fminv s1, p0, z2.s
 ; CHECK-NEXT:    fminv s0, p0, z0.s
-; CHECK-NEXT:    fminnm s0, s0, s2
+; CHECK-NEXT:    fminnm s0, s0, s1
 ; CHECK-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fminimum.nxv8f32(<vscale x 8 x float> %a)
   %r2 = call fast float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float> %b)
@@ -73,9 +73,9 @@ define float @fmaximum_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    fmaxv s2, p0, z2.s
+; CHECK-NEXT:    fmaxv s1, p0, z2.s
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
-; CHECK-NEXT:    fmaxnm s0, s0, s2
+; CHECK-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmaximum.nxv8f32(<vscale x 8 x float> %a)
   %r2 = call fast float @llvm.vector.reduce.fmaximum.nxv4f32(<vscale x 4 x float> %b)
@@ -87,8 +87,8 @@ define float @fmaximum_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) {
 define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: add_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    fmov x0, d0
@@ -107,9 +107,9 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpkhi z3.h, z1.b
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    add z0.h, z0.h, z2.h
 ; CHECK-NEXT:    add z1.h, z1.h, z3.h
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
 ; CHECK-NEXT:    fmov x0, d0
@@ -132,11 +132,11 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-NEXT:    uunpkhi z0.h, z0.b
 ; CHECK-NEXT:    uunpkhi z5.h, z2.b
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    add z1.h, z4.h, z3.h
 ; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    add z1.h, z2.h, z5.h
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
 ; CHECK-NEXT:    fmov x0, d0
@@ -160,8 +160,8 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
 define i32 @and_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: and_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    and z0.d, z0.d, z2.d
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
@@ -175,8 +175,8 @@ define i32 @and_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
 define i32 @or_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: or_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NEXT:    orr z0.d, z0.d, z2.d
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0

diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
index abafb311d12c8a..5469c29f1aa7e0 100644
--- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
@@ -10,8 +10,8 @@
 define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a) #0 {
 ; CHECK-LABEL: sdiv_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.b, #86 // =0x56
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z1.b, #86 // =0x56
 ; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    lsr z1.b, z0.b, #7
 ; CHECK-NEXT:    add z0.b, z0.b, z1.b
@@ -23,8 +23,8 @@ define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a) #0 {
 define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a) #0 {
 ; CHECK-LABEL: sdiv_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #21846
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #21846 // =0x5556
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    lsr z1.h, z0.h, #15
@@ -37,8 +37,8 @@ define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a) #0 {
 define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a) #0 {
 ; CHECK-LABEL: sdiv_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #21846
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #21846 // =0x5556
 ; CHECK-NEXT:    movk w8, #21845, lsl #16
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
@@ -52,8 +52,8 @@ define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a) #0 {
 define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: sdiv_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #6148914691236517205
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #6148914691236517205 // =0x5555555555555555
 ; CHECK-NEXT:    movk x8, #21846
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
@@ -71,8 +71,8 @@ define <vscale x 2 x i64> @sdiv_i64(<vscale x 2 x i64> %a) #0 {
 define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a) #0 {
 ; CHECK-LABEL: udiv_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.b, #-85 // =0xffffffffffffffab
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z1.b, #-85 // =0xffffffffffffffab
 ; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    lsr z0.b, z0.b, #1
 ; CHECK-NEXT:    ret
@@ -83,8 +83,8 @@ define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a) #0 {
 define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a) #0 {
 ; CHECK-LABEL: udiv_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-21845
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #-21845 // =0xffffaaab
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    lsr z0.h, z0.h, #1
@@ -96,8 +96,8 @@ define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a) #0 {
 define <vscale x 4 x i32> @udiv_i32(<vscale x 4 x i32> %a) #0 {
 ; CHECK-LABEL: udiv_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #43691
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #43691 // =0xaaab
 ; CHECK-NEXT:    movk w8, #43690, lsl #16
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
@@ -110,8 +110,8 @@ define <vscale x 4 x i32> @udiv_i32(<vscale x 4 x i32> %a) #0 {
 define <vscale x 2 x i64> @udiv_i64(<vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: udiv_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-6148914691236517206
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa
 ; CHECK-NEXT:    movk x8, #43691
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d

diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
index 3385ad525c7ed7..273785f2436404 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll
@@ -630,8 +630,8 @@ define i1 @test_lanex_4xi1(<vscale x 4 x i1> %a, i32 %x) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
-; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb w8, p0, z0.s
+; CHECK-NEXT:    whilels p1.s, xzr, x8
+; CHECK-NEXT:    lastb w8, p1, z0.s
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %b = extractelement <vscale x 4 x i1> %a, i32 %x

diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
index e4448df84e47a0..bc1c563810f358 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll
@@ -70,8 +70,8 @@ define <2 x i16> @extract_v2i16_nxv32i16_8(<vscale x 32 x i16> %arg) {
 ; CHECK-NEXT:    addvl sp, sp, #-8
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    add x8, x8, #32
 ; CHECK-NEXT:    st1h { z3.h }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    st1h { z2.h }, p0, [sp, #2, mul vl]
@@ -100,15 +100,15 @@ define <2 x i64> @extract_v2i64_nxv8i64_8(<vscale x 8 x i64> %arg) {
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cnth x8
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    sub x8, x8, #2
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmp x8, #8
-; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    st1d { z2.d }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
@@ -162,8 +162,8 @@ define <4 x i1> @extract_v4i1_nxv32i1_0(<vscale x 32 x i1> %arg) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
 ; CHECK-NEXT:    umov w8, v1.b[1]
-; CHECK-NEXT:    umov w9, v1.b[2]
 ; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    umov w9, v1.b[2]
 ; CHECK-NEXT:    mov v0.h[1], w8
 ; CHECK-NEXT:    umov w8, v1.b[3]
 ; CHECK-NEXT:    mov v0.h[2], w9
@@ -183,11 +183,11 @@ define <4 x i1> @extract_v4i1_nxv32i1_16(<vscale x 32 x i1> %arg) {
 ; CHECK-NEXT:    addvl sp, sp, #-8
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ptrue p2.b
-; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
 ; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    st1b { z0.b }, p2, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1b { z1.b }, p2, [sp]
 ; CHECK-NEXT:    st1b { z0.b }, p2, [sp, #3, mul vl]
@@ -240,8 +240,8 @@ define <4 x i3> @extract_v4i3_nxv32i3_16(<vscale x 32 x i3> %arg) {
 ; CHECK-NEXT:    addvl sp, sp, #-8
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
@@ -298,8 +298,8 @@ define <4 x i64> @extract_v4i64_nxv8i64_0(<vscale x 8 x i64> %arg) {
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    ldr q1, [sp, #16]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ldr q1, [sp, #16]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
index 9038fa698fe3b1..79962d441d1048 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
@@ -17,15 +17,15 @@ define <2 x i64> @extract_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    mov w9, #2
+; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    sub x8, x8, #2
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmp x8, #2
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    ldr q0, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -50,15 +50,15 @@ define <4 x i32> @extract_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    mov w9, #4
+; CHECK-NEXT:    mov w9, #4 // =0x4
 ; CHECK-NEXT:    sub x8, x8, #4
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmp x8, #4
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    ldr q0, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -84,12 +84,12 @@ define <4 x i32> @extract_v4i32_nxv2i32_idx4(<vscale x 2 x i32> %vec) nounwind #
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov x8, #4
-; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #4 // =0x4
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ptrue p1.d, vl4
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x9, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x9, x8, lsl #3]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -115,15 +115,15 @@ define <8 x i16> @extract_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cnth x8
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    sub x8, x8, #8
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cmp x8, #8
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #1
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    ldr q0, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -149,12 +149,12 @@ define <8 x i16> @extract_v8i16_nxv4i16_idx8(<vscale x 4 x i16> %vec) nounwind #
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov x8, #8
-; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, #8 // =0x8
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ptrue p1.s, vl8
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x9, x8, lsl #2]
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -182,12 +182,12 @@ define <8 x i16> @extract_v8i16_nxv2i16_idx8(<vscale x 2 x i16> %vec) nounwind #
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov x8, #8
-; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #8 // =0x8
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ptrue p1.d, vl8
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    ptrue p0.d, vl8
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x9, x8, lsl #3]
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x9, x8, lsl #3]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -214,14 +214,14 @@ define <16 x i8> @extract_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec) nounwind
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov x8, #-16
-; CHECK-NEXT:    mov w9, #16
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    mov x8, #-16 // =0xfffffffffffffff0
+; CHECK-NEXT:    mov w9, #16 // =0x10
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #16
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT:    ldr q0, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -247,12 +247,12 @@ define <16 x i8> @extract_v16i8_nxv8i8_idx16(<vscale x 8 x i8> %vec) nounwind #1
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov x8, #16
-; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, #16 // =0x10
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ptrue p1.h, vl16
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x9, x8, lsl #1]
+; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x9, x8, lsl #1]
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -280,12 +280,12 @@ define <16 x i8> @extract_v16i8_nxv4i8_idx16(<vscale x 4 x i8> %vec) nounwind #1
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov x8, #16
-; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, #16 // =0x10
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    ptrue p1.s, vl16
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
-; CHECK-NEXT:    ptrue p0.s, vl16
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x9, x8, lsl #2]
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -351,8 +351,8 @@ define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    mov w9, v1.s[2]
 ; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov w9, v1.s[2]
 ; CHECK-NEXT:    mov v0.h[1], w8
 ; CHECK-NEXT:    mov w8, v1.s[3]
 ; CHECK-NEXT:    mov v0.h[2], w9
@@ -368,8 +368,8 @@ define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
 ; CHECK-NEXT:    umov w8, v1.h[1]
-; CHECK-NEXT:    umov w9, v1.h[2]
 ; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    umov w9, v1.h[2]
 ; CHECK-NEXT:    mov v0.b[1], w8
 ; CHECK-NEXT:    umov w8, v1.h[3]
 ; CHECK-NEXT:    mov v0.b[2], w9
@@ -393,8 +393,8 @@ define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
 ; CHECK-NEXT:    umov w8, v1.b[1]
-; CHECK-NEXT:    umov w9, v1.b[2]
 ; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    umov w9, v1.b[2]
 ; CHECK-NEXT:    mov v0.b[1], w8
 ; CHECK-NEXT:    umov w8, v1.b[3]
 ; CHECK-NEXT:    mov v0.b[2], w9

diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
index 462f02d3f5d7e3..e60a2f142922fd 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
@@ -65,27 +65,29 @@ define <vscale x 14 x i1> @extract_nxv14i1_nxv28i1_14(<vscale x 28 x i1> %in) uw
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT:    punpkhi p2.h, p1.b
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    punpklo p1.h, p1.b
-; CHECK-NEXT:    punpklo p2.h, p2.b
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    punpklo p2.h, p2.b
 ; CHECK-NEXT:    punpkhi p3.h, p1.b
-; CHECK-NEXT:    punpkhi p4.h, p2.b
-; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    punpkhi p4.h, p2.b
 ; CHECK-NEXT:    punpklo p2.h, p2.b
 ; CHECK-NEXT:    punpkhi p5.h, p3.b
-; CHECK-NEXT:    uzp1 p4.s, p4.s, p0.s
-; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    punpklo p3.h, p3.b
-; CHECK-NEXT:    uzp1 p2.s, p5.s, p2.s
-; CHECK-NEXT:    punpkhi p5.h, p1.b
+; CHECK-NEXT:    punpkhi p6.h, p1.b
 ; CHECK-NEXT:    punpklo p1.h, p1.b
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    uzp1 p3.s, p5.s, p3.s
+; CHECK-NEXT:    uzp1 p2.s, p5.s, p2.s
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p3.s, p6.s, p3.s
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p4.s, p4.s, p0.s
 ; CHECK-NEXT:    uzp1 p0.s, p0.s, p1.s
 ; CHECK-NEXT:    uzp1 p1.h, p2.h, p4.h
-; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    uzp1 p0.h, p0.h, p3.h
 ; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
@@ -556,18 +558,18 @@ define <vscale x 14 x i8> @extract_nxv14i8_nxv28i8_14(<vscale x 28 x i8> %in) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpkhi z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.h, z1.b
+; CHECK-NEXT:    uunpkhi z1.h, z1.b
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z5.d, z4.s
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
@@ -580,8 +582,8 @@ define <vscale x 14 x i8> @extract_nxv14i8_nxv28i8_14(<vscale x 28 x i8> %in) {
 ; CHECK-NEXT:    uunpkhi z3.d, z3.s
 ; CHECK-NEXT:    uzp1 z3.s, z4.s, z3.s
 ; CHECK-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
 ; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpkhi z3.s, z0.h
@@ -596,8 +598,8 @@ define <vscale x 14 x i8> @extract_nxv14i8_nxv28i8_14(<vscale x 28 x i8> %in) {
 ; CHECK-NEXT:    uunpkhi z4.d, z4.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z4.s
 ; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z3.h
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z3.h
 ; CHECK-NEXT:    uzp1 z2.b, z0.b, z2.b
 ; CHECK-NEXT:    uunpkhi z2.h, z2.b
 ; CHECK-NEXT:    uunpklo z3.s, z2.h

diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll
index 35cbe65c6a8b86..f7e3b6d0171ac3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll
@@ -374,8 +374,8 @@ define <vscale x 4 x i1> @one_zero(<vscale x 4 x float> %x) {
 define <vscale x 4 x i1> @ueq_zero(<vscale x 4 x float> %x) {
 ; CHECK-LABEL: ueq_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
 ; CHECK-NEXT:    fcmuo p1.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    sel p0.b, p0, p0.b, p1.b

diff --git a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll
index 65f1055ffafc01..d2315844dc2f0f 100644
--- a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll
@@ -62,13 +62,13 @@ define <vscale x 4 x float> @test_copysign_v4f32_v4f64(<vscale x 4 x float> %a,
 ; CHECK-EXTEND-ROUND:       // %bb.0:
 ; CHECK-EXTEND-ROUND-NEXT:    ptrue p0.d
 ; CHECK-EXTEND-ROUND-NEXT:    uunpkhi z3.d, z0.s
+; CHECK-EXTEND-ROUND-NEXT:    uunpklo z0.d, z0.s
+; CHECK-EXTEND-ROUND-NEXT:    and z3.s, z3.s, #0x7fffffff
+; CHECK-EXTEND-ROUND-NEXT:    and z0.s, z0.s, #0x7fffffff
 ; CHECK-EXTEND-ROUND-NEXT:    fcvt z2.s, p0/m, z2.d
 ; CHECK-EXTEND-ROUND-NEXT:    fcvt z1.s, p0/m, z1.d
-; CHECK-EXTEND-ROUND-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-EXTEND-ROUND-NEXT:    and z2.s, z2.s, #0x80000000
-; CHECK-EXTEND-ROUND-NEXT:    and z3.s, z3.s, #0x7fffffff
 ; CHECK-EXTEND-ROUND-NEXT:    and z1.s, z1.s, #0x80000000
-; CHECK-EXTEND-ROUND-NEXT:    and z0.s, z0.s, #0x7fffffff
 ; CHECK-EXTEND-ROUND-NEXT:    orr z2.d, z3.d, z2.d
 ; CHECK-EXTEND-ROUND-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-EXTEND-ROUND-NEXT:    uzp1 z0.s, z0.s, z2.s
@@ -116,16 +116,16 @@ define <vscale x 4 x double> @test_copysign_v4f64_v4f32(<vscale x 4 x double> %a
 ; CHECK-LABEL: test_copysign_v4f64_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z3.d, z2.s
-; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z3.d, z2.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
+; CHECK-NEXT:    and z1.d, z1.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    fcvt z3.d, p0/m, z3.s
 ; CHECK-NEXT:    fcvt z2.d, p0/m, z2.s
-; CHECK-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
-; CHECK-NEXT:    and z2.d, z2.d, #0x8000000000000000
 ; CHECK-NEXT:    and z3.d, z3.d, #0x8000000000000000
-; CHECK-NEXT:    and z1.d, z1.d, #0x7fffffffffffffff
-; CHECK-NEXT:    orr z0.d, z0.d, z2.d
-; CHECK-NEXT:    orr z1.d, z1.d, z3.d
+; CHECK-NEXT:    and z2.d, z2.d, #0x8000000000000000
+; CHECK-NEXT:    orr z0.d, z0.d, z3.d
+; CHECK-NEXT:    orr z1.d, z1.d, z2.d
 ; CHECK-NEXT:    ret
   %tmp0 = fpext <vscale x 4 x float> %b to <vscale x 4 x double>
   %r = call <vscale x 4 x double> @llvm.copysign.v4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %tmp0)
@@ -192,13 +192,13 @@ define <vscale x 4 x half> @test_copysign_v4f16_v4f64(<vscale x 4 x half> %a, <v
 ; CHECK-EXTEND-ROUND:       // %bb.0:
 ; CHECK-EXTEND-ROUND-NEXT:    ptrue p0.d
 ; CHECK-EXTEND-ROUND-NEXT:    uunpkhi z3.d, z0.s
+; CHECK-EXTEND-ROUND-NEXT:    uunpklo z0.d, z0.s
+; CHECK-EXTEND-ROUND-NEXT:    and z3.h, z3.h, #0x7fff
+; CHECK-EXTEND-ROUND-NEXT:    and z0.h, z0.h, #0x7fff
 ; CHECK-EXTEND-ROUND-NEXT:    fcvt z2.h, p0/m, z2.d
 ; CHECK-EXTEND-ROUND-NEXT:    fcvt z1.h, p0/m, z1.d
-; CHECK-EXTEND-ROUND-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-EXTEND-ROUND-NEXT:    and z2.h, z2.h, #0x8000
-; CHECK-EXTEND-ROUND-NEXT:    and z3.h, z3.h, #0x7fff
 ; CHECK-EXTEND-ROUND-NEXT:    and z1.h, z1.h, #0x8000
-; CHECK-EXTEND-ROUND-NEXT:    and z0.h, z0.h, #0x7fff
 ; CHECK-EXTEND-ROUND-NEXT:    orr z2.d, z3.d, z2.d
 ; CHECK-EXTEND-ROUND-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-EXTEND-ROUND-NEXT:    uzp1 z0.s, z0.s, z2.s
@@ -239,13 +239,13 @@ define <vscale x 8 x half> @test_copysign_v8f16_v8f32(<vscale x 8 x half> %a, <v
 ; CHECK-EXTEND-ROUND:       // %bb.0:
 ; CHECK-EXTEND-ROUND-NEXT:    ptrue p0.s
 ; CHECK-EXTEND-ROUND-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-EXTEND-ROUND-NEXT:    uunpklo z0.s, z0.h
+; CHECK-EXTEND-ROUND-NEXT:    and z3.h, z3.h, #0x7fff
+; CHECK-EXTEND-ROUND-NEXT:    and z0.h, z0.h, #0x7fff
 ; CHECK-EXTEND-ROUND-NEXT:    fcvt z2.h, p0/m, z2.s
 ; CHECK-EXTEND-ROUND-NEXT:    fcvt z1.h, p0/m, z1.s
-; CHECK-EXTEND-ROUND-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-EXTEND-ROUND-NEXT:    and z2.h, z2.h, #0x8000
-; CHECK-EXTEND-ROUND-NEXT:    and z3.h, z3.h, #0x7fff
 ; CHECK-EXTEND-ROUND-NEXT:    and z1.h, z1.h, #0x8000
-; CHECK-EXTEND-ROUND-NEXT:    and z0.h, z0.h, #0x7fff
 ; CHECK-EXTEND-ROUND-NEXT:    orr z2.d, z3.d, z2.d
 ; CHECK-EXTEND-ROUND-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-EXTEND-ROUND-NEXT:    uzp1 z0.h, z0.h, z2.h
@@ -261,9 +261,9 @@ define <vscale x 8 x half> @test_copysign_v8f16_v8f32(<vscale x 8 x half> %a, <v
 define <vscale x 4 x half> @test_copysign_nxv4f32_nxv4f16(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
 ; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv4f32_nxv4f16:
 ; CHECK-NO-EXTEND-ROUND:       // %bb.0:
+; CHECK-NO-EXTEND-ROUND-NEXT:    ptrue p0.s
 ; CHECK-NO-EXTEND-ROUND-NEXT:    and z1.s, z1.s, #0x80000000
 ; CHECK-NO-EXTEND-ROUND-NEXT:    and z0.s, z0.s, #0x7fffffff
-; CHECK-NO-EXTEND-ROUND-NEXT:    ptrue p0.s
 ; CHECK-NO-EXTEND-ROUND-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NO-EXTEND-ROUND-NEXT:    fcvt z0.h, p0/m, z0.s
 ; CHECK-NO-EXTEND-ROUND-NEXT:    ret
@@ -285,9 +285,9 @@ define <vscale x 4 x half> @test_copysign_nxv4f32_nxv4f16(<vscale x 4 x float> %
 define <vscale x 2 x float> @test_copysign_nxv2f64_nxv2f32(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
 ; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv2f64_nxv2f32:
 ; CHECK-NO-EXTEND-ROUND:       // %bb.0:
+; CHECK-NO-EXTEND-ROUND-NEXT:    ptrue p0.d
 ; CHECK-NO-EXTEND-ROUND-NEXT:    and z1.d, z1.d, #0x8000000000000000
 ; CHECK-NO-EXTEND-ROUND-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
-; CHECK-NO-EXTEND-ROUND-NEXT:    ptrue p0.d
 ; CHECK-NO-EXTEND-ROUND-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-NO-EXTEND-ROUND-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NO-EXTEND-ROUND-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-fix-length-and-combine-512.ll b/llvm/test/CodeGen/AArch64/sve-fix-length-and-combine-512.ll
index 05944346e299c7..bf706f3122e3a3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fix-length-and-combine-512.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fix-length-and-combine-512.ll
@@ -4,9 +4,9 @@
 define void @vls_sve_and_64xi8(ptr %ap, ptr %out) nounwind {
 ; CHECK-LABEL: vls_sve_and_64xi8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b, vl64
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    add x8, x8, :lo12:.LCPI0_0
-; CHECK-NEXT:    ptrue p0.b, vl64
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x8]
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll
index 2cf0b47a421144..ed7ea657874a4a 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll
@@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @masked_gather_base_plus_stride_v8f32(ptr %dst, ptr %src) #0 {
 ; CHECK-LABEL: masked_gather_base_plus_stride_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    index z0.s, #0, #7
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    index z0.s, #0, #7
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1, z0.s, sxtw #2]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -21,8 +21,8 @@ define void @masked_gather_base_plus_stride_v8f32(ptr %dst, ptr %src) #0 {
 define void @masked_gather_base_plus_stride_v4f64(ptr %dst, ptr %src) #0 {
 ; CHECK-LABEL: masked_gather_base_plus_stride_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-32
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    mov x8, #-32 // =0xffffffffffffffe0
 ; CHECK-NEXT:    index z0.d, #-2, x8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1, z0.d, lsl #3]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
@@ -37,9 +37,9 @@ define void @masked_scatter_base_plus_stride_v8f32(ptr %dst, ptr %src) #0 {
 ; CHECK-LABEL: masked_scatter_base_plus_stride_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    index z1.s, #0, #-7
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
+; CHECK-NEXT:    index z0.s, #0, #-7
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
 ; CHECK-NEXT:    ret
   %data = load <8 x float>, ptr %src, align 4
   %ptrs = getelementptr float, ptr %dst, <8 x i64> <i64 0, i64 -7, i64 -14, i64 -21, i64 -28, i64 -35, i64 -42, i64 -49>
@@ -51,9 +51,9 @@ define void @masked_scatter_base_plus_stride_v4f64(ptr %dst, ptr %src) #0 {
 ; CHECK-LABEL: masked_scatter_base_plus_stride_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    index z1.d, #-2, #3
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
-; CHECK-NEXT:    st1d { z0.d }, p0, [x0, z1.d, lsl #3]
+; CHECK-NEXT:    index z0.d, #-2, #3
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0, z0.d, lsl #3]
 ; CHECK-NEXT:    ret
   %data = load <4 x double>, ptr %src, align 8
   %ptrs = getelementptr double, ptr %dst, <4 x i64> <i64 -2, i64 1, i64 4, i64 7>

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
index 6174ae3f1c1345..1e71c4b66156cb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
@@ -46,8 +46,8 @@ define void @ctlz_v32i8(ptr %a) vscale_range(2,0) #0 {
 define void @ctlz_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctlz_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    clz z0.b, p0/m, z0.b
@@ -134,8 +134,8 @@ define void @ctlz_v16i16(ptr %a) vscale_range(2,0) #0 {
 define void @ctlz_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctlz_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    clz z0.h, p0/m, z0.h
@@ -222,8 +222,8 @@ define void @ctlz_v8i32(ptr %a) vscale_range(2,0) #0 {
 define void @ctlz_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctlz_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    clz z0.s, p0/m, z0.s
@@ -276,8 +276,8 @@ define void @ctlz_v64i32(ptr %a)  vscale_range(16,0) #0 {
 define <1 x i64> @ctlz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -288,8 +288,8 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 define <2 x i64> @ctlz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ctlz_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -314,8 +314,8 @@ define void @ctlz_v4i64(ptr %a) vscale_range(2,0) #0 {
 define void @ctlz_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctlz_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    clz z0.d, p0/m, z0.d
@@ -406,8 +406,8 @@ define void @ctpop_v32i8(ptr %a) vscale_range(2,0) #0 {
 define void @ctpop_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctpop_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    cnt z0.b, p0/m, z0.b
@@ -496,8 +496,8 @@ define void @ctpop_v16i16(ptr %a) vscale_range(2,0) #0 {
 define void @ctpop_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctpop_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    cnt z0.h, p0/m, z0.h
@@ -588,8 +588,8 @@ define void @ctpop_v8i32(ptr %a) vscale_range(2,0) #0 {
 define void @ctpop_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctpop_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    cnt z0.s, p0/m, z0.s
@@ -682,8 +682,8 @@ define void @ctpop_v4i64(ptr %a) vscale_range(2,0) #0 {
 define void @ctpop_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctpop_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    cnt z0.d, p0/m, z0.d
@@ -740,8 +740,8 @@ define void @ctpop_v32i64(ptr %a) vscale_range(16,0) #0 {
 define <8 x i8> @cttz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    clz v0.8b, v0.8b
 ; CHECK-NEXT:    ret
@@ -752,8 +752,8 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 define <16 x i8> @cttz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    clz v0.16b, v0.16b
 ; CHECK-NEXT:    ret
@@ -779,8 +779,8 @@ define void @cttz_v32i8(ptr %a) vscale_range(2,0) #0 {
 define void @cttz_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: cttz_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    rbit z0.b, p0/m, z0.b
@@ -838,8 +838,8 @@ define void @cttz_v256i8(ptr %a) vscale_range(16,0) #0 {
 define <4 x i16> @cttz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    clz v0.4h, v0.4h
 ; CHECK-NEXT:    ret
@@ -850,8 +850,8 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 define <8 x i16> @cttz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    clz v0.8h, v0.8h
 ; CHECK-NEXT:    ret
@@ -877,8 +877,8 @@ define void @cttz_v16i16(ptr %a) vscale_range(2,0) #0 {
 define void @cttz_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: cttz_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    rbit z0.h, p0/m, z0.h
@@ -937,8 +937,8 @@ define void @cttz_v128i16(ptr %a) vscale_range(16,0) #0 {
 define <2 x i32> @cttz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    clz v0.2s, v0.2s
 ; CHECK-NEXT:    ret
@@ -950,8 +950,8 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 define <4 x i32> @cttz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    clz v0.4s, v0.4s
 ; CHECK-NEXT:    ret
@@ -977,8 +977,8 @@ define void @cttz_v8i32(ptr %a) vscale_range(2,0) #0 {
 define void @cttz_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: cttz_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    rbit z0.s, p0/m, z0.s
@@ -1036,8 +1036,8 @@ define void @cttz_v64i32(ptr %a) vscale_range(16,0) #0 {
 define <1 x i64> @cttz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1049,8 +1049,8 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 define <2 x i64> @cttz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: cttz_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -1077,8 +1077,8 @@ define void @cttz_v4i64(ptr %a) vscale_range(2,0) #0 {
 define void @cttz_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: cttz_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    rbit z0.d, p0/m, z0.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
index a7ac7a8f131b79..e39aa474c25334 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
@@ -47,8 +47,8 @@ define void @bitcast_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @bitcast_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: bitcast_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
@@ -135,8 +135,8 @@ define void @bitcast_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @bitcast_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: bitcast_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
@@ -223,8 +223,8 @@ define void @bitcast_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @bitcast_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: bitcast_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
index ec3e1d8aa7fdc8..3fdd08701053eb 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll
@@ -13,14 +13,14 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r
 ; CHECK-LABEL: fixed_bitselect_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    mov z3.s, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z1.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
-; CHECK-NEXT:    add z3.s, z0.s, z3.s
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x2]
+; CHECK-NEXT:    add z1.s, z0.s, z1.s
 ; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
-; CHECK-NEXT:    and z0.d, z0.d, z1.d
-; CHECK-NEXT:    and z1.d, z3.d, z2.d
+; CHECK-NEXT:    and z0.d, z0.d, z2.d
+; CHECK-NEXT:    and z1.d, z1.d, z3.d
 ; CHECK-NEXT:    orr z0.d, z1.d, z0.d
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
index 8914905c579caa..0e3307d1617298 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
@@ -42,8 +42,8 @@ define void @build_vector_0_dec3_v8i32(ptr %a) #0 {
 define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: build_vector_minus2_dec32_v4i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #-32
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #-32
 ; VBITS_GE_256-NEXT:    index z0.d, #-2, x8
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -60,9 +60,9 @@ define void @build_vector_no_stride_v4i64(ptr %a) #0 {
 ; VBITS_GE_256-NEXT:    .xword  8
 ; VBITS_GE_256-LABEL: build_vector_no_stride_v4i64:
 ; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    adrp x8, .LCPI4_0
 ; VBITS_GE_256-NEXT:    add x8, x8, :lo12:.LCPI4_0
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x8]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
index 41f6c1f5250e79..e54d22b140bf60 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
@@ -35,12 +35,12 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #
 define void @concat_v32i8(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
-; CHECK-NEXT:    ptrue p0.b, vl32
-; CHECK-NEXT:    st1b { z1.b }, p0, [x2]
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ptrue p1.b, vl32
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
   %op2 = load <16 x i8>, ptr %b
@@ -56,7 +56,7 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: concat_v64i8:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov w8, #32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x2, x8]
@@ -66,11 +66,11 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_512-LABEL: concat_v64i8:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_512-NEXT:    ptrue p1.b, vl64
 ; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    splice z0.b, p0, z0.b, z1.b
-; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
-; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x2]
+; VBITS_GE_512-NEXT:    st1b { z0.b }, p1, [x2]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -90,11 +90,11 @@ define void @concat_v128i8(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v128i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    ptrue p1.b, vl128
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
-; CHECK-NEXT:    ptrue p0.b, vl128
-; CHECK-NEXT:    st1b { z0.b }, p0, [x2]
+; CHECK-NEXT:    st1b { z0.b }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <64 x i8>, ptr %a
   %op2 = load <64 x i8>, ptr %b
@@ -122,11 +122,11 @@ define void @concat_v256i8(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v256i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ptrue p1.b, vl256
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
-; CHECK-NEXT:    ptrue p0.b, vl256
-; CHECK-NEXT:    st1b { z0.b }, p0, [x2]
+; CHECK-NEXT:    st1b { z0.b }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, ptr %a
   %op2 = load <128 x i8>, ptr %b
@@ -195,12 +195,12 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0)
 define void @concat_v16i16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    st1h { z1.h }, p0, [x2]
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ptrue p1.h, vl16
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %op2 = load <8 x i16>, ptr %b
@@ -214,7 +214,7 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: concat_v32i16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x8, lsl #1]
@@ -224,11 +224,11 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_512-LABEL: concat_v32i16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ptrue p1.h, vl32
 ; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    splice z0.h, p0, z0.h, z1.h
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p1, [x2]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -244,11 +244,11 @@ define void @concat_v64i16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v64i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.h, vl64
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    st1h { z0.h }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i16>, ptr %a
   %op2 = load <32 x i16>, ptr %b
@@ -268,11 +268,11 @@ define void @concat_v128i16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v128i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ptrue p1.h, vl128
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    st1h { z0.h }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, ptr %a
   %op2 = load <64 x i16>, ptr %b
@@ -325,12 +325,12 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0)
 define void @concat_v8i32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
-; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    st1w { z1.s }, p0, [x2]
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ptrue p1.s, vl8
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
   %op2 = load <4 x i32>, ptr %b
@@ -343,7 +343,7 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: concat_v16i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
@@ -353,11 +353,11 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_512-LABEL: concat_v16i32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    splice z0.s, p0, z0.s, z1.s
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p1, [x2]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -371,11 +371,11 @@ define void @concat_v32i32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v32i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %op2 = load <16 x i32>, ptr %b
@@ -391,11 +391,11 @@ define void @concat_v64i32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v64i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, ptr %a
   %op2 = load <32 x i32>, ptr %b
@@ -430,12 +430,12 @@ define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0)
 define void @concat_v4i64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
-; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    st1d { z1.d }, p0, [x2]
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ptrue p1.d, vl4
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
   %op2 = load <2 x i64>, ptr %b
@@ -448,7 +448,7 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: concat_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
@@ -458,11 +458,11 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_512-LABEL: concat_v8i64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    splice z0.d, p0, z0.d, z1.d
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p1, [x2]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -475,11 +475,11 @@ define void @concat_v16i64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v16i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    ptrue p1.d, vl16
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    st1d { z0.d }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %op2 = load <8 x i64>, ptr %b
@@ -493,11 +493,11 @@ define void @concat_v32i64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v32i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    st1d { z0.d }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, ptr %a
   %op2 = load <16 x i64>, ptr %b
@@ -538,12 +538,12 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2
 define void @concat_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    st1h { z1.h }, p0, [x2]
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ptrue p1.h, vl16
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    st1h { z0.h }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %op2 = load <8 x half>, ptr %b
@@ -557,7 +557,7 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: concat_v32f16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2, x8, lsl #1]
@@ -567,11 +567,11 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_512-LABEL: concat_v32f16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_512-NEXT:    ptrue p1.h, vl32
 ; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    splice z0.h, p0, z0.h, z1.h
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
+; VBITS_GE_512-NEXT:    st1h { z0.h }, p1, [x2]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -587,11 +587,11 @@ define void @concat_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v64f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    ptrue p1.h, vl64
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    st1h { z0.h }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x half>, ptr %a
   %op2 = load <32 x half>, ptr %b
@@ -611,11 +611,11 @@ define void @concat_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v128f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    ptrue p1.h, vl128
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
+; CHECK-NEXT:    st1h { z0.h }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <64 x half>, ptr %a
   %op2 = load <64 x half>, ptr %b
@@ -668,12 +668,12 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) vscale_rang
 define void @concat_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
-; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    st1w { z1.s }, p0, [x2]
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ptrue p1.s, vl8
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x float>, ptr %a
   %op2 = load <4 x float>, ptr %b
@@ -686,7 +686,7 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: concat_v16f32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
@@ -696,11 +696,11 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_512-LABEL: concat_v16f32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    splice z0.s, p0, z0.s, z1.s
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p1, [x2]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -714,11 +714,11 @@ define void @concat_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v32f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %op2 = load <16 x float>, ptr %b
@@ -734,11 +734,11 @@ define void @concat_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v64f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x float>, ptr %a
   %op2 = load <32 x float>, ptr %b
@@ -773,12 +773,12 @@ define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) vscale_r
 define void @concat_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
-; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    st1d { z1.d }, p0, [x2]
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ptrue p1.d, vl4
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    st1d { z0.d }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <2 x double>, ptr %a
   %op2 = load <2 x double>, ptr %b
@@ -791,7 +791,7 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: concat_v8f64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
@@ -801,11 +801,11 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_512-LABEL: concat_v8f64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    splice z0.d, p0, z0.d, z1.d
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p1, [x2]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -818,11 +818,11 @@ define void @concat_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
 ; CHECK-LABEL: concat_v16f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    ptrue p1.d, vl16
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    st1d { z0.d }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %op2 = load <8 x double>, ptr %b
@@ -836,11 +836,11 @@ define void @concat_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
 ; CHECK-LABEL: concat_v32f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
+; CHECK-NEXT:    st1d { z0.d }, p1, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %op2 = load <16 x double>, ptr %b
@@ -859,8 +859,8 @@ define void @concat_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
 define void @concat_v32i8_undef(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v32i8_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i8>, ptr %a
@@ -875,8 +875,8 @@ define void @concat_v32i8_undef(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @concat_v16i16_undef(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v16i16_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
@@ -889,8 +889,8 @@ define void @concat_v16i16_undef(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @concat_v8i32_undef(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8i32_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i32>, ptr %a
@@ -902,8 +902,8 @@ define void @concat_v8i32_undef(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @concat_v4i64_undef(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4i64_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <2 x i64>, ptr %a
@@ -919,8 +919,8 @@ define void @concat_v4i64_undef(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @concat_v32i8_4op(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v32i8_4op:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl32
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i8>, ptr %a
@@ -937,8 +937,8 @@ define void @concat_v32i8_4op(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @concat_v16i16_4op(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v16i16_4op:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i16>, ptr %a
@@ -952,8 +952,8 @@ define void @concat_v16i16_4op(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @concat_v8i32_4op(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v8i32_4op:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <2 x i32>, ptr %a
@@ -966,8 +966,8 @@ define void @concat_v8i32_4op(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @concat_v4i64_4op(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: concat_v4i64_4op:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <1 x i64>, ptr %a

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
index d7bd08628ff538..94aef218d4de31 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
@@ -73,8 +73,8 @@ define <32 x i32> @load_zext_v32i16i32(ptr %ap) vscale_range(8,0) #0 {
 define <64 x i32> @load_zext_v64i16i32(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_zext_v64i16i32:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov x9, #32 // =0x20
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    mov x9, #32 // =0x20
 ; VBITS_GE_1024-NEXT:    ld1h { z0.s }, p0/z, [x0, x9, lsl #1]
 ; VBITS_GE_1024-NEXT:    ld1h { z1.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
@@ -142,8 +142,8 @@ define <32 x i32> @load_sext_v32i16i32(ptr %ap) vscale_range(8,0) #0 {
 define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_sext_v64i16i32:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov x9, #32 // =0x20
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    mov x9, #32 // =0x20
 ; VBITS_GE_1024-NEXT:    ld1sh { z0.s }, p0/z, [x0, x9, lsl #1]
 ; VBITS_GE_1024-NEXT:    ld1sh { z1.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
@@ -164,8 +164,8 @@ define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 {
 define <32 x i64> @load_zext_v32i8i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_zext_v32i8i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov w9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    mov w9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ld1b { z0.d }, p0/z, [x0, x9]
 ; VBITS_GE_1024-NEXT:    ld1b { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
@@ -187,8 +187,8 @@ define <32 x i64> @load_zext_v32i8i64(ptr %ap) #0 {
 define <32 x i64> @load_sext_v32i8i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_sext_v32i8i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov w9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    mov w9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ld1sb { z0.d }, p0/z, [x0, x9]
 ; VBITS_GE_1024-NEXT:    ld1sb { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
@@ -210,8 +210,8 @@ define <32 x i64> @load_sext_v32i8i64(ptr %ap) #0 {
 define <32 x i64> @load_zext_v32i16i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_zext_v32i16i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ld1h { z0.d }, p0/z, [x0, x9, lsl #1]
 ; VBITS_GE_1024-NEXT:    ld1h { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
@@ -232,8 +232,8 @@ define <32 x i64> @load_zext_v32i16i64(ptr %ap) #0 {
 define <32 x i64> @load_sext_v32i16i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_sext_v32i16i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ld1sh { z0.d }, p0/z, [x0, x9, lsl #1]
 ; VBITS_GE_1024-NEXT:    ld1sh { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
@@ -254,8 +254,8 @@ define <32 x i64> @load_sext_v32i16i64(ptr %ap) #0 {
 define <32 x i64> @load_zext_v32i32i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_zext_v32i32i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ld1w { z0.d }, p0/z, [x0, x9, lsl #2]
 ; VBITS_GE_1024-NEXT:    ld1w { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
@@ -276,8 +276,8 @@ define <32 x i64> @load_zext_v32i32i64(ptr %ap) #0 {
 define <32 x i64> @load_sext_v32i32i64(ptr %ap) #0 {
 ; VBITS_GE_1024-LABEL: load_sext_v32i32i64:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_1024-NEXT:    ld1sw { z0.d }, p0/z, [x0, x9, lsl #2]
 ; VBITS_GE_1024-NEXT:    ld1sw { z1.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
index 6f80ce10e643d7..066c06d5aa76c5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -45,8 +45,8 @@ define void @extract_subvector_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @extract_subvector_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
@@ -137,8 +137,8 @@ define void @extract_subvector_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @extract_subvector_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
@@ -228,8 +228,8 @@ define void @extract_subvector_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @extract_subvector_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
@@ -308,8 +308,8 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @extract_subvector_v8i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -322,14 +322,14 @@ define void @extract_subvector_v8i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @extract_subvector_v16i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v16i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov x9, #12
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #12 // =0xc
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
   %op = load <16 x i64>, ptr %a
   %ret = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> %op, i64 8)
@@ -340,8 +340,8 @@ define void @extract_subvector_v16i64(ptr %a, ptr %b) #0 {
 define void @extract_subvector_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extract_subvector_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #16
 ; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    mov x8, #16 // =0x10
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -392,8 +392,8 @@ define void @extract_subvector_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @extract_subvector_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
@@ -483,8 +483,8 @@ define void @extract_subvector_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @extract_subvector_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
@@ -563,8 +563,8 @@ define void @extract_subvector_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @extract_subvector_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
index b8cac79f92bbb6..485124c1d59ed9 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
@@ -46,8 +46,8 @@ define half @extractelement_v16f16(ptr %a) vscale_range(2,0) #0 {
 define half @extractelement_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: extractelement_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
 ; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -69,10 +69,10 @@ define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v64f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    mov w8, #63 // =0x3f
+; CHECK-NEXT:    whilels p1.h, xzr, x8
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.h, xzr, x8
-; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    lastb h0, p1, z0.h
 ; CHECK-NEXT:    ret
     %op1 = load <64 x half>, ptr %a
     %r = extractelement <64 x half> %op1, i64 63
@@ -83,10 +83,10 @@ define half @extractelement_v128f16(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v128f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    mov w8, #127
+; CHECK-NEXT:    mov w8, #127 // =0x7f
+; CHECK-NEXT:    whilels p1.h, xzr, x8
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.h, xzr, x8
-; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    lastb h0, p1, z0.h
 ; CHECK-NEXT:    ret
     %op1 = load <128 x half>, ptr %a
     %r = extractelement <128 x half> %op1, i64 127
@@ -130,8 +130,8 @@ define float @extractelement_v8f32(ptr %a) vscale_range(2,0) #0 {
 define float @extractelement_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: extractelement_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
 ; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
@@ -153,10 +153,10 @@ define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v32f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    mov w8, #31 // =0x1f
+; CHECK-NEXT:    whilels p1.s, xzr, x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    lastb s0, p1, z0.s
 ; CHECK-NEXT:    ret
     %op1 = load <32 x float>, ptr %a
     %r = extractelement <32 x float> %op1, i64 31
@@ -167,10 +167,10 @@ define float @extractelement_v64f32(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v64f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    mov w8, #63
+; CHECK-NEXT:    mov w8, #63 // =0x3f
+; CHECK-NEXT:    whilels p1.s, xzr, x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    lastb s0, p1, z0.s
 ; CHECK-NEXT:    ret
     %op1 = load <64 x float>, ptr %a
     %r = extractelement <64 x float> %op1, i64 63
@@ -212,8 +212,8 @@ define double @extractelement_v4f64(ptr %a) vscale_range(2,0) #0 {
 define double @extractelement_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: extractelement_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
 ; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -235,10 +235,10 @@ define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v16f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    mov w8, #15
+; CHECK-NEXT:    mov w8, #15 // =0xf
+; CHECK-NEXT:    whilels p1.d, xzr, x8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.d, xzr, x8
-; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    lastb d0, p1, z0.d
 ; CHECK-NEXT:    ret
     %op1 = load <16 x double>, ptr %a
     %r = extractelement <16 x double> %op1, i64 15
@@ -249,10 +249,10 @@ define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v32f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    mov w8, #31
+; CHECK-NEXT:    mov w8, #31 // =0x1f
+; CHECK-NEXT:    whilels p1.d, xzr, x8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.d, xzr, x8
-; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    lastb d0, p1, z0.d
 ; CHECK-NEXT:    ret
     %op1 = load <32 x double>, ptr %a
     %r = extractelement <32 x double> %op1, i64 31

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll
index 675afe876bb09f..9a980432945841 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll
@@ -15,10 +15,10 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v4f16_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    mvni v0.4h, #128, lsl #8
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    bsl v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <4 x half>, ptr %ap
@@ -31,10 +31,10 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v8f16_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    mvni v2.8h, #128, lsl #8
-; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <8 x half>, ptr %ap
@@ -65,15 +65,15 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) vscale_range(2,0) #0
 define void @test_copysign_v32f16_v32f16(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: test_copysign_v32f16_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    and z2.h, z2.h, #0x8000
 ; VBITS_GE_256-NEXT:    and z0.h, z0.h, #0x7fff
 ; VBITS_GE_256-NEXT:    and z1.h, z1.h, #0x7fff
-; VBITS_GE_256-NEXT:    and z2.h, z2.h, #0x8000
 ; VBITS_GE_256-NEXT:    and z3.h, z3.h, #0x8000
 ; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z2.d
 ; VBITS_GE_256-NEXT:    orr z1.d, z1.d, z3.d
@@ -139,10 +139,10 @@ define void @test_copysign_v128f16_v128f16(ptr %ap, ptr %bp) vscale_range(16,0)
 define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v2f32_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    mvni v2.2s, #128, lsl #24
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    mvni v0.2s, #128, lsl #24
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    bsl v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <2 x float>, ptr %ap
@@ -155,10 +155,10 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v4f32_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    mvni v0.4s, #128, lsl #24
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <4 x float>, ptr %ap
@@ -189,15 +189,15 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @test_copysign_v16f32_v16f32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: test_copysign_v16f32_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    and z2.s, z2.s, #0x80000000
 ; VBITS_GE_256-NEXT:    and z0.s, z0.s, #0x7fffffff
 ; VBITS_GE_256-NEXT:    and z1.s, z1.s, #0x7fffffff
-; VBITS_GE_256-NEXT:    and z2.s, z2.s, #0x80000000
 ; VBITS_GE_256-NEXT:    and z3.s, z3.s, #0x80000000
 ; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z2.d
 ; VBITS_GE_256-NEXT:    orr z1.d, z1.d, z3.d
@@ -298,15 +298,15 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @test_copysign_v8f64_v8f64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: test_copysign_v8f64_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    and z2.d, z2.d, #0x8000000000000000
 ; VBITS_GE_256-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
 ; VBITS_GE_256-NEXT:    and z1.d, z1.d, #0x7fffffffffffffff
-; VBITS_GE_256-NEXT:    and z2.d, z2.d, #0x8000000000000000
 ; VBITS_GE_256-NEXT:    and z3.d, z3.d, #0x8000000000000000
 ; VBITS_GE_256-NEXT:    orr z0.d, z0.d, z2.d
 ; VBITS_GE_256-NEXT:    orr z1.d, z1.d, z3.d
@@ -373,10 +373,10 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v2f32_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    mvni v2.2s, #128, lsl #24
-; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    mvni v1.2s, #128, lsl #24
+; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <2 x float>, ptr %ap
@@ -395,10 +395,10 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-NEXT:    fcvt z1.s, p0/m, z1.d
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcvt z1.s, p1/m, z1.d
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    str q0, [x0]
@@ -454,8 +454,8 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK_EXTEND_ROUND-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK_EXTEND_ROUND-NEXT:    ldr q1, [x1]
 ; CHECK_EXTEND_ROUND-NEXT:    uunpklo z1.d, z1.s
-; CHECK_EXTEND_ROUND-NEXT:    fcvt z1.d, p0/m, z1.s
 ; CHECK_EXTEND_ROUND-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
+; CHECK_EXTEND_ROUND-NEXT:    fcvt z1.d, p0/m, z1.s
 ; CHECK_EXTEND_ROUND-NEXT:    and z1.d, z1.d, #0x8000000000000000
 ; CHECK_EXTEND_ROUND-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK_EXTEND_ROUND-NEXT:    st1d { z0.d }, p0, [x0]
@@ -474,10 +474,10 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v4f16_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    mvni v1.4h, #128, lsl #8
+; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <4 x half>, ptr %ap
@@ -493,10 +493,10 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    fcvt z1.h, p0/m, z1.d
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcvt z1.h, p1/m, z1.d
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
@@ -520,10 +520,10 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mvni v2.8h, #128, lsl #8
-; CHECK-NEXT:    fcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcvt z1.h, p1/m, z1.s
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    str q0, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
index 8698f4a4b27638..3f831ea54bc817 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
@@ -48,8 +48,8 @@ define void @fadd_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fadd_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fadd_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -146,8 +146,8 @@ define void @fadd_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fadd_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fadd_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -244,8 +244,8 @@ define void @fadd_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fadd_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fadd_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -346,14 +346,14 @@ define void @fdiv_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fdiv_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fdiv_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fdiv z0.h, p0/m, z0.h, z2.h
-; VBITS_GE_256-NEXT:    fdiv z1.h, p0/m, z1.h, z3.h
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fdiv z1.h, p0/m, z1.h, z2.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -444,14 +444,14 @@ define void @fdiv_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fdiv_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fdiv_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fdiv z0.s, p0/m, z0.s, z2.s
-; VBITS_GE_256-NEXT:    fdiv z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fdiv z1.s, p0/m, z1.s, z2.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -542,14 +542,14 @@ define void @fdiv_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fdiv_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fdiv_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fdiv z0.d, p0/m, z0.d, z2.d
-; VBITS_GE_256-NEXT:    fdiv z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    fdiv z1.d, p0/m, z1.d, z2.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -648,8 +648,8 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -758,8 +758,8 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -867,8 +867,8 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -977,8 +977,8 @@ define void @fmul_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmul_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fmul_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -1075,8 +1075,8 @@ define void @fmul_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmul_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fmul_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -1173,8 +1173,8 @@ define void @fmul_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmul_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fmul_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -1273,8 +1273,8 @@ define void @fneg_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fneg_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fneg_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fneg z0.h, p0/m, z0.h
@@ -1361,8 +1361,8 @@ define void @fneg_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @fneg_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fneg_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fneg z0.s, p0/m, z0.s
@@ -1449,8 +1449,8 @@ define void @fneg_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @fneg_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fneg_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fneg z0.d, p0/m, z0.d
@@ -1541,11 +1541,11 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fsqrt_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fsqrt_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fsqrt z0.h, p0/m, z0.h
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fsqrt z1.h, p0/m, z1.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
@@ -1629,11 +1629,11 @@ define void @fsqrt_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @fsqrt_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fsqrt_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fsqrt z0.s, p0/m, z0.s
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fsqrt z1.s, p0/m, z1.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
@@ -1717,11 +1717,11 @@ define void @fsqrt_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @fsqrt_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fsqrt_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fsqrt z0.d, p0/m, z0.d
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fsqrt z1.d, p0/m, z1.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
@@ -1811,8 +1811,8 @@ define void @fsub_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fsub_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fsub_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -1909,8 +1909,8 @@ define void @fsub_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fsub_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fsub_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -2007,8 +2007,8 @@ define void @fsub_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fsub_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fsub_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -2107,8 +2107,8 @@ define void @fabs_v16f16(ptr %a) vscale_range(2,0) #0 {
 define void @fabs_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fabs_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fabs z0.h, p0/m, z0.h
@@ -2195,8 +2195,8 @@ define void @fabs_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @fabs_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fabs_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fabs z0.s, p0/m, z0.s
@@ -2283,8 +2283,8 @@ define void @fabs_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @fabs_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fabs_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fabs z0.d, p0/m, z0.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
index 582ff6ab747c68..b97a5f7b055973 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
@@ -52,8 +52,8 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 define void @fcmp_oeq_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fcmp_oeq_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -162,8 +162,8 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 define void @fcmp_oeq_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fcmp_oeq_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -272,8 +272,8 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 define void @fcmp_oeq_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fcmp_oeq_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
index 0bf8b9372518a3..6da07b855a5c57 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -54,8 +54,8 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.h
@@ -157,8 +157,8 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v8f16_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.h
@@ -257,8 +257,8 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.s
@@ -357,8 +357,8 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvt_v16f32_v16f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v16f32_v16f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.s
@@ -430,8 +430,8 @@ define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvt_v2f64_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
@@ -460,13 +460,13 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvt_v8f64_v8f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p1.d
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p0.d
-; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.d
-; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z1.d
+; VBITS_GE_256-NEXT:    fcvt z0.h, p1/m, z0.d
+; VBITS_GE_256-NEXT:    fcvt z1.h, p1/m, z1.d
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
@@ -562,8 +562,8 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvt_v8f64_v8f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v8f64_v8f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
index a32932005770e7..7f65f4b10277ff 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
@@ -55,8 +55,8 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -171,8 +171,8 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -286,8 +286,8 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
 define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
index a736fdf8d5bf7b..6e81876adc3a03 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
@@ -48,8 +48,8 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmaxnm_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmaxnm_v32f16:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -146,8 +146,8 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmaxnm_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmaxnm_v16f32:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -244,8 +244,8 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmaxnm_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmaxnm_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -346,8 +346,8 @@ define void @fminnm_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fminnm_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fminnm_v32f16:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -444,8 +444,8 @@ define void @fminnm_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fminnm_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fminnm_v16f32:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -542,8 +542,8 @@ define void @fminnm_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fminnm_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fminnm_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -644,8 +644,8 @@ define void @fmax_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmax_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmax_v32f16:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -742,8 +742,8 @@ define void @fmax_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmax_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmax_v16f32:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -840,8 +840,8 @@ define void @fmax_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmax_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmax_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -942,8 +942,8 @@ define void @fmin_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmin_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmin_v32f16:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
+; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -1040,8 +1040,8 @@ define void @fmin_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmin_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmin_v16f32:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -1138,8 +1138,8 @@ define void @fmin_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fmin_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmin_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
index 7fc0ab12e8d28b..bbd9613819b19e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
@@ -13,8 +13,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define half @fadda_v4f16(half %start, <4 x half> %a) vscale_range(1,0) #0 {
 ; CHECK-LABEL: fadda_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fadda h0, p0, h0, z1.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -27,8 +27,8 @@ define half @fadda_v4f16(half %start, <4 x half> %a) vscale_range(1,0) #0 {
 define half @fadda_v8f16(half %start, <8 x half> %a) vscale_range(1,0) #0 {
 ; CHECK-LABEL: fadda_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fadda h0, p0, h0, z1.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -54,8 +54,8 @@ define half @fadda_v16f16(half %start, ptr %a) vscale_range(2,0) #0 {
 define half @fadda_v32f16(half %start, ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fadda_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
@@ -109,8 +109,8 @@ define half @fadda_v128f16(half %start, ptr %a) vscale_range(16,0) #0 {
 define float @fadda_v2f32(float %start, <2 x float> %a) vscale_range(1,0) #0 {
 ; CHECK-LABEL: fadda_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fadda s0, p0, s0, z1.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
@@ -123,8 +123,8 @@ define float @fadda_v2f32(float %start, <2 x float> %a) vscale_range(1,0) #0 {
 define float @fadda_v4f32(float %start, <4 x float> %a) vscale_range(1,0) #0 {
 ; CHECK-LABEL: fadda_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fadda s0, p0, s0, z1.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
@@ -150,8 +150,8 @@ define float @fadda_v8f32(float %start, ptr %a) vscale_range(2,0) #0 {
 define float @fadda_v16f32(float %start, ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fadda_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
@@ -215,8 +215,8 @@ define double @fadda_v1f64(double %start, <1 x double> %a) vscale_range(1,0) #0
 define double @fadda_v2f64(double %start, <2 x double> %a) vscale_range(1,0) #0 {
 ; CHECK-LABEL: fadda_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fadda d0, p0, d0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -242,8 +242,8 @@ define double @fadda_v4f64(double %start, ptr %a) vscale_range(2,0) #0 {
 define double @fadda_v8f64(double %start, ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fadda_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
@@ -301,8 +301,8 @@ define double @fadda_v32f64(double %start, ptr %a) vscale_range(16,0) #0 {
 define half @faddv_v4f16(half %start, <4 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
@@ -314,8 +314,8 @@ define half @faddv_v4f16(half %start, <4 x half> %a) vscale_range(2,0) #0 {
 define half @faddv_v8f16(half %start, <8 x half> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
@@ -339,8 +339,8 @@ define half @faddv_v16f16(half %start, ptr %a) vscale_range(2,0) #0 {
 define half @faddv_v32f16(half %start, ptr %a) #0 {
 ; VBITS_GE_256-LABEL: faddv_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z2.h
@@ -401,8 +401,8 @@ define float @faddv_v2f32(float %start, <2 x float> %a) vscale_range(2,0) #0 {
 define float @faddv_v4f32(float %start, <4 x float> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: faddv_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
@@ -426,8 +426,8 @@ define float @faddv_v8f32(float %start, ptr %a) vscale_range(2,0) #0 {
 define float @faddv_v16f32(float %start, ptr %a) #0 {
 ; VBITS_GE_256-LABEL: faddv_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z2.s
@@ -510,8 +510,8 @@ define double @faddv_v4f64(double %start, ptr %a) vscale_range(2,0) #0 {
 define double @faddv_v8f64(double %start, ptr %a) #0 {
 ; VBITS_GE_256-LABEL: faddv_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z2.d
@@ -597,8 +597,8 @@ define half @fmaxv_v16f16(ptr %a) vscale_range(2,0) #0 {
 define half @fmaxv_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fmaxv_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
@@ -680,8 +680,8 @@ define float @fmaxv_v8f32(ptr %a) vscale_range(2,0) #0 {
 define float @fmaxv_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fmaxv_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
@@ -762,8 +762,8 @@ define double @fmaxv_v4f64(ptr %a) vscale_range(2,0) #0 {
 define double @fmaxv_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fmaxv_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
@@ -849,8 +849,8 @@ define half @fminv_v16f16(ptr %a) vscale_range(2,0) #0 {
 define half @fminv_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fminv_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
@@ -932,8 +932,8 @@ define float @fminv_v8f32(ptr %a) vscale_range(2,0) #0 {
 define float @fminv_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fminv_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
@@ -1014,8 +1014,8 @@ define double @fminv_v4f64(ptr %a) vscale_range(2,0) #0 {
 define double @fminv_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fminv_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
@@ -1099,8 +1099,8 @@ define half @fmaximumv_v16f16(ptr %a) vscale_range(2,0) #0 {
 define half @fmaximumv_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fmaximumv_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
@@ -1182,8 +1182,8 @@ define float @fmaximumv_v8f32(ptr %a) vscale_range(2,0) #0 {
 define float @fmaximumv_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fmaximumv_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
@@ -1264,8 +1264,8 @@ define double @fmaximumv_v4f64(ptr %a) vscale_range(2,0) #0 {
 define double @fmaximumv_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fmaximumv_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
@@ -1349,8 +1349,8 @@ define half @fminimumv_v16f16(ptr %a) vscale_range(2,0) #0 {
 define half @fminimumv_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fminimumv_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
@@ -1432,8 +1432,8 @@ define float @fminimumv_v8f32(ptr %a) vscale_range(2,0) #0 {
 define float @fminimumv_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fminimumv_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
@@ -1514,8 +1514,8 @@ define double @fminimumv_v4f64(ptr %a) vscale_range(2,0) #0 {
 define double @fminimumv_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fminimumv_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fmin z0.d, p0/m, z0.d, z1.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
index ac46e1d8ddedf2..9ac6fc96e35468 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
@@ -46,8 +46,8 @@ define void @frintp_v16f16(ptr %a) vscale_range(2,0) #0 {
 define void @frintp_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintp_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintp z0.h, p0/m, z0.h
@@ -134,8 +134,8 @@ define void @frintp_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @frintp_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintp_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintp z0.s, p0/m, z0.s
@@ -222,8 +222,8 @@ define void @frintp_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @frintp_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintp_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintp z0.d, p0/m, z0.d
@@ -314,8 +314,8 @@ define void @frintm_v16f16(ptr %a) vscale_range(2,0) #0 {
 define void @frintm_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintm_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintm z0.h, p0/m, z0.h
@@ -402,8 +402,8 @@ define void @frintm_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @frintm_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintm_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintm z0.s, p0/m, z0.s
@@ -490,8 +490,8 @@ define void @frintm_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @frintm_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintm_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintm z0.d, p0/m, z0.d
@@ -582,8 +582,8 @@ define void @frinti_v16f16(ptr %a) vscale_range(2,0) #0 {
 define void @frinti_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinti_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frinti z0.h, p0/m, z0.h
@@ -670,8 +670,8 @@ define void @frinti_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @frinti_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinti_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frinti z0.s, p0/m, z0.s
@@ -758,8 +758,8 @@ define void @frinti_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @frinti_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinti_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frinti z0.d, p0/m, z0.d
@@ -850,8 +850,8 @@ define void @frintx_v16f16(ptr %a) vscale_range(2,0) #0 {
 define void @frintx_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintx_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintx z0.h, p0/m, z0.h
@@ -938,8 +938,8 @@ define void @frintx_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @frintx_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintx_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintx z0.s, p0/m, z0.s
@@ -1026,8 +1026,8 @@ define void @frintx_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @frintx_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintx_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintx z0.d, p0/m, z0.d
@@ -1118,8 +1118,8 @@ define void @frinta_v16f16(ptr %a) vscale_range(2,0) #0 {
 define void @frinta_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinta_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frinta z0.h, p0/m, z0.h
@@ -1206,8 +1206,8 @@ define void @frinta_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @frinta_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinta_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frinta z0.s, p0/m, z0.s
@@ -1294,8 +1294,8 @@ define void @frinta_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @frinta_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinta_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frinta z0.d, p0/m, z0.d
@@ -1386,8 +1386,8 @@ define void @frintn_v16f16(ptr %a) vscale_range(2,0) #0 {
 define void @frintn_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintn_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintn z0.h, p0/m, z0.h
@@ -1474,8 +1474,8 @@ define void @frintn_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @frintn_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintn_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintn z0.s, p0/m, z0.s
@@ -1562,8 +1562,8 @@ define void @frintn_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @frintn_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintn_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintn z0.d, p0/m, z0.d
@@ -1654,8 +1654,8 @@ define void @frintz_v16f16(ptr %a) vscale_range(2,0) #0 {
 define void @frintz_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintz_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintz z0.h, p0/m, z0.h
@@ -1742,8 +1742,8 @@ define void @frintz_v8f32(ptr %a) vscale_range(2,0) #0 {
 define void @frintz_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintz_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintz z0.s, p0/m, z0.s
@@ -1830,8 +1830,8 @@ define void @frintz_v4f64(ptr %a) vscale_range(2,0) #0 {
 define void @frintz_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintz_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    frintz z0.d, p0/m, z0.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
index 91c16d08b8fdf8..8b3946136f90f0 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
@@ -34,14 +34,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) vsca
 define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z1.h, z2.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <16 x half>, ptr %a
@@ -54,32 +54,32 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_256-NEXT:    ptrue p1.h
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z4.h, w9
-; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z4.h, #0
+; VBITS_GE_256-NEXT:    mov z0.h, w8
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sel z1.h, p1, z1.h, z3.h
-; VBITS_GE_256-NEXT:    sel z0.h, p1, z0.h, z2.h
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov z0.h, p1/m, z2.h
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v32f16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p1.h
-; VBITS_GE_512-NEXT:    mov z2.h, w8
-; VBITS_GE_512-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; VBITS_GE_512-NEXT:    sel z0.h, p1, z0.h, z1.h
+; VBITS_GE_512-NEXT:    mov z0.h, w8
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_512-NEXT:    sel z0.h, p1, z1.h, z2.h
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <32 x half>, ptr %a
@@ -92,14 +92,14 @@ define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 {
 define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v64f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z1.h, z2.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <64 x half>, ptr %a
@@ -112,14 +112,14 @@ define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 define void @select_v128f16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v128f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z1.h, z2.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <128 x half>, ptr %a
@@ -158,14 +158,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) v
 define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <8 x float>, ptr %a
@@ -178,32 +178,32 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z4.s, w9
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z4.s, #0
+; VBITS_GE_256-NEXT:    mov z0.s, w8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sel z1.s, p1, z1.s, z3.s
-; VBITS_GE_256-NEXT:    sel z0.s, p1, z0.s, z2.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov z0.s, p1/m, z2.s
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v16f32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p1.s
-; VBITS_GE_512-NEXT:    mov z2.s, w8
-; VBITS_GE_512-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; VBITS_GE_512-NEXT:    sel z0.s, p1, z0.s, z1.s
+; VBITS_GE_512-NEXT:    mov z0.s, w8
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; VBITS_GE_512-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <16 x float>, ptr %a
@@ -216,14 +216,14 @@ define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 {
 define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v32f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <32 x float>, ptr %a
@@ -236,14 +236,14 @@ define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 define void @select_v64f32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v64f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <64 x float>, ptr %a
@@ -282,15 +282,15 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
 define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    and x8, x2, #0x1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <4 x double>, ptr %a
@@ -303,34 +303,34 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    and x9, x2, #0x1
+; VBITS_GE_256-NEXT:    // kill: def $w2 killed $w2 def $x2
+; VBITS_GE_256-NEXT:    and x8, x2, #0x1
 ; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z4.d, x9
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z4.d, #0
+; VBITS_GE_256-NEXT:    mov z0.d, x8
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sel z1.d, p1, z1.d, z3.d
-; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z2.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov z0.d, p1/m, z2.d
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v8f64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; VBITS_GE_512-NEXT:    and x8, x2, #0x1
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    ptrue p1.d
-; VBITS_GE_512-NEXT:    mov z2.d, x8
-; VBITS_GE_512-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
+; VBITS_GE_512-NEXT:    mov z0.d, x8
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; VBITS_GE_512-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <8 x double>, ptr %a
@@ -343,15 +343,15 @@ define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 {
 define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    and x8, x2, #0x1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <16 x double>, ptr %a
@@ -364,15 +364,15 @@ define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 define void @select_v32f64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v32f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    and x8, x2, #0x1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <32 x double>, ptr %a

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
index 57f635ba9418ae..da0cf927d74d24 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
@@ -50,8 +50,8 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvtzu_v32f16_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v32f16_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzu z0.h, p0/m, z0.h
@@ -131,8 +131,8 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 {
 define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v8f16_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
@@ -147,7 +147,7 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v16f16_v16i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
@@ -250,16 +250,16 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v8f16_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr q0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.h
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.h
 ; VBITS_GE_256-NEXT:    fcvtzu z1.d, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -326,8 +326,8 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzu v1.4s, v0.4s
 ; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    mov w9, v1.s[2]
 ; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov w9, v1.s[2]
 ; CHECK-NEXT:    mov v0.h[1], w8
 ; CHECK-NEXT:    mov w8, v1.s[3]
 ; CHECK-NEXT:    mov v0.h[2], w9
@@ -355,18 +355,18 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) vscale_range(2,0) #0 {
 define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; VBITS_GE_256-NEXT:    fcvtzu z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvtzu_v16f32_v16i16:
@@ -451,8 +451,8 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvtzu_v16f32_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzu z0.s, p0/m, z0.s
@@ -532,8 +532,8 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 {
 define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
@@ -548,7 +548,7 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v8f32_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
@@ -610,8 +610,8 @@ define void @fcvtzu_v32f32_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzu_v1f64_v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
@@ -650,8 +650,8 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) vscale_range(2,0) #0 {
 define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.d
@@ -750,18 +750,18 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) vscale_range(2,0) #0 {
 define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; VBITS_GE_256-NEXT:    fcvtzu z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i32:
@@ -847,8 +847,8 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvtzu_v8f64_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzu z0.d, p0/m, z0.d
@@ -943,8 +943,8 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvtzs_v32f16_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v32f16_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzs z0.h, p0/m, z0.h
@@ -1024,8 +1024,8 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 {
 define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v8f16_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
@@ -1040,7 +1040,7 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v16f16_v16i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
@@ -1143,16 +1143,16 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v8f16_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr q0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.h
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.h
 ; VBITS_GE_256-NEXT:    fcvtzs z1.d, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -1219,8 +1219,8 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvtzs v1.4s, v0.4s
 ; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    mov w9, v1.s[2]
 ; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov w9, v1.s[2]
 ; CHECK-NEXT:    mov v0.h[1], w8
 ; CHECK-NEXT:    mov w8, v1.s[3]
 ; CHECK-NEXT:    mov v0.h[2], w9
@@ -1248,18 +1248,18 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) vscale_range(2,0) #0 {
 define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; VBITS_GE_256-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvtzs_v16f32_v16i16:
@@ -1344,8 +1344,8 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvtzs_v16f32_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzs z0.s, p0/m, z0.s
@@ -1425,8 +1425,8 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 {
 define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
@@ -1441,7 +1441,7 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v8f32_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
@@ -1503,8 +1503,8 @@ define void @fcvtzs_v32f32_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: fcvtzs_v1f64_v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
@@ -1543,8 +1543,8 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) vscale_range(2,0) #0 {
 define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.d
@@ -1643,18 +1643,18 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) vscale_range(2,0) #0 {
 define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; VBITS_GE_256-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i32:
@@ -1740,8 +1740,8 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @fcvtzs_v8f64_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    fcvtzs z0.d, p0/m, z0.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
index d79ef79760ca54..3566bbc2b45614 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -51,8 +51,8 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @select_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -164,8 +164,8 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @select_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -278,8 +278,8 @@ define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @select_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
index 31ff9287046cd5..9b0d86556f0b72 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
@@ -13,8 +13,8 @@ define void @fcvt_v4f64_v4f128(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    sub sp, sp, #48
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    add x8, sp, #48
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    str z0, [x8, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
@@ -62,8 +62,8 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    sub sp, sp, #128
 ; CHECK-NEXT:    ldr q1, [x0, #64]
-; CHECK-NEXT:    mov x19, x1
 ; CHECK-NEXT:    ldr q0, [x0, #80]
+; CHECK-NEXT:    mov x19, x1
 ; CHECK-NEXT:    stp q0, q1, [sp, #96] // 32-byte Folded Spill
 ; CHECK-NEXT:    ldr q1, [x0, #96]
 ; CHECK-NEXT:    ldr q0, [x0, #112]
@@ -88,10 +88,10 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    str q0, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    ldr q0, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl __trunctfdf2
+; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add x8, sp, #128
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    add x8, sp, #128
 ; CHECK-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-NEXT:    ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
@@ -112,14 +112,14 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ldr q0, [sp, #112] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl __trunctfdf2
 ; CHECK-NEXT:    ldr q1, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    add x9, sp, #128
+; CHECK-NEXT:    ptrue p1.d, vl2
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov x8, #4
-; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ldr z1, [x9] // 16-byte Folded Reload
-; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    add x8, sp, #128
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ldr z1, [x8] // 16-byte Folded Reload
+; CHECK-NEXT:    mov x8, #4 // =0x4
+; CHECK-NEXT:    splice z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x19, x8, lsl #3]
 ; CHECK-NEXT:    add x8, sp, #128
 ; CHECK-NEXT:    ldr z0, [x8, #1, mul vl] // 16-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
index bf69d2d1bbe7c3..d73ef4731d48fc 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
@@ -11,58 +11,53 @@ target triple = "aarch64-unknown-linux-gnu"
 define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
 ; CHECK-LABEL: func1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x25, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    str x29, [sp, #-48]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w21, -24
 ; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w23, -40
-; CHECK-NEXT:    .cfi_offset w24, -48
-; CHECK-NEXT:    .cfi_offset w25, -56
-; CHECK-NEXT:    .cfi_offset w29, -64
-; CHECK-NEXT:    add x8, sp, #64
-; CHECK-NEXT:    add x9, sp, #128
-; CHECK-NEXT:    add x10, sp, #160
-; CHECK-NEXT:    add x11, sp, #192
+; CHECK-NEXT:    .cfi_offset w29, -48
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    add x20, sp, #192
+; CHECK-NEXT:    add x8, sp, #48
+; CHECK-NEXT:    add x9, sp, #112
+; CHECK-NEXT:    add x10, sp, #144
+; CHECK-NEXT:    add x11, sp, #176
+; CHECK-NEXT:    add x20, sp, #176
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x9]
 ; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x10]
 ; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x11]
-; CHECK-NEXT:    ldp x18, x19, [sp, #368]
-; CHECK-NEXT:    add x21, sp, #160
-; CHECK-NEXT:    add x22, sp, #128
-; CHECK-NEXT:    ldp x24, x14, [sp, #296]
-; CHECK-NEXT:    add x23, sp, #64
-; CHECK-NEXT:    ldr x25, [sp, #288]
-; CHECK-NEXT:    ldp x9, x8, [sp, #344]
-; CHECK-NEXT:    ldp x11, x10, [sp, #328]
-; CHECK-NEXT:    ldp x13, x12, [sp, #312]
-; CHECK-NEXT:    ldr x15, [sp, #120]
-; CHECK-NEXT:    ldur q4, [sp, #104]
-; CHECK-NEXT:    ldp x16, x17, [sp, #224]
+; CHECK-NEXT:    ldp x9, x8, [sp, #328]
+; CHECK-NEXT:    ldp x11, x10, [sp, #312]
+; CHECK-NEXT:    ldr x15, [sp, #104]
+; CHECK-NEXT:    ldp x13, x12, [sp, #296]
+; CHECK-NEXT:    ldur q4, [sp, #88]
+; CHECK-NEXT:    ldp x18, x14, [sp, #280]
+; CHECK-NEXT:    ldr x19, [sp, #272]
+; CHECK-NEXT:    ldp x16, x17, [sp, #208]
+; CHECK-NEXT:    ldp x21, x22, [sp, #352]
 ; CHECK-NEXT:    st1d { z3.d }, p0, [x20]
-; CHECK-NEXT:    st1d { z2.d }, p0, [x21]
-; CHECK-NEXT:    st1d { z1.d }, p0, [x22]
-; CHECK-NEXT:    st1d { z0.d }, p0, [x23]
-; CHECK-NEXT:    stp x18, x19, [sp, #368]
-; CHECK-NEXT:    stp x25, x24, [sp, #288]
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    stp x16, x17, [sp, #224]
-; CHECK-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    stur q4, [sp, #104]
-; CHECK-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    str x15, [sp, #120]
-; CHECK-NEXT:    stp x14, x13, [sp, #304]
-; CHECK-NEXT:    stp x12, x11, [sp, #320]
-; CHECK-NEXT:    stp x10, x9, [sp, #336]
-; CHECK-NEXT:    str x8, [sp, #352]
-; CHECK-NEXT:    ldp x29, x25, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT:    add x20, sp, #144
+; CHECK-NEXT:    st1d { z2.d }, p0, [x20]
+; CHECK-NEXT:    add x20, sp, #112
+; CHECK-NEXT:    st1d { z1.d }, p0, [x20]
+; CHECK-NEXT:    add x20, sp, #48
+; CHECK-NEXT:    st1d { z0.d }, p0, [x20]
+; CHECK-NEXT:    stp x21, x22, [sp, #352]
+; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    stp x19, x18, [sp, #272]
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    stp x16, x17, [sp, #208]
+; CHECK-NEXT:    stur q4, [sp, #88]
+; CHECK-NEXT:    str x15, [sp, #104]
+; CHECK-NEXT:    stp x14, x13, [sp, #288]
+; CHECK-NEXT:    stp x12, x11, [sp, #304]
+; CHECK-NEXT:    stp x10, x9, [sp, #320]
+; CHECK-NEXT:    str x8, [sp, #336]
+; CHECK-NEXT:    ldr x29, [sp], #48 // 8-byte Folded Reload
 ; CHECK-NEXT:    b func2
                              ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr %v13, ptr %v14,  ptr %v15, ptr %v16,
                              ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24,

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
index 4fdeb8112c0349..2dd06e08d16b63 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
@@ -36,16 +36,16 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
 define <16 x half> @insertelement_v16f16(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: insertelement_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #15
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    fmov h2, #5.00000000
-; CHECK-NEXT:    index z3.h, #0, #1
+; CHECK-NEXT:    mov w9, #15 // =0xf
+; CHECK-NEXT:    index z0.h, #0, #1
 ; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    mov z1.h, w9
-; CHECK-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
-; CHECK-NEXT:    mov z0.h, p1/m, h2
-; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; CHECK-NEXT:    cmpeq p1.h, p1/z, z0.h, z1.h
+; CHECK-NEXT:    fmov h0, #5.00000000
+; CHECK-NEXT:    mov z2.h, p1/m, h0
+; CHECK-NEXT:    st1h { z2.h }, p0, [x8]
 ; CHECK-NEXT:    ret
     %op1 = load <16 x half>, ptr %a
     %r = insertelement <16 x half> %op1, half 5.0, i64 15
@@ -55,33 +55,33 @@ define <16 x half> @insertelement_v16f16(ptr %a) vscale_range(2,0) #0 {
 define <32 x half> @insertelement_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: insertelement_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov w10, #15
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    fmov h3, #5.00000000
-; VBITS_GE_256-NEXT:    index z4.h, #0, #1
+; VBITS_GE_256-NEXT:    mov w9, #15 // =0xf
+; VBITS_GE_256-NEXT:    index z0.h, #0, #1
 ; VBITS_GE_256-NEXT:    ptrue p1.h
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    mov z2.h, w10
-; VBITS_GE_256-NEXT:    cmpeq p1.h, p1/z, z4.h, z2.h
-; VBITS_GE_256-NEXT:    mov z0.h, p1/m, h3
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x8, x9, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x8]
+; VBITS_GE_256-NEXT:    mov z1.h, w9
+; VBITS_GE_256-NEXT:    mov x9, #16 // =0x10
+; VBITS_GE_256-NEXT:    fmov h2, #5.00000000
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_256-NEXT:    cmpeq p1.h, p1/z, z0.h, z1.h
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    mov z3.h, p1/m, h2
+; VBITS_GE_256-NEXT:    st1h { z3.h }, p0, [x8, x9, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x8]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: insertelement_v32f16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov w9, #31
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    fmov h2, #5.00000000
-; VBITS_GE_512-NEXT:    index z3.h, #0, #1
+; VBITS_GE_512-NEXT:    mov w9, #31 // =0x1f
+; VBITS_GE_512-NEXT:    index z0.h, #0, #1
 ; VBITS_GE_512-NEXT:    ptrue p1.h
 ; VBITS_GE_512-NEXT:    mov z1.h, w9
-; VBITS_GE_512-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
-; VBITS_GE_512-NEXT:    mov z0.h, p1/m, h2
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x8]
+; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    cmpeq p1.h, p1/z, z0.h, z1.h
+; VBITS_GE_512-NEXT:    fmov h0, #5.00000000
+; VBITS_GE_512-NEXT:    mov z2.h, p1/m, h0
+; VBITS_GE_512-NEXT:    st1h { z2.h }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
     %op1 = load <32 x half>, ptr %a
     %r = insertelement <32 x half> %op1, half 5.0, i64 31
@@ -91,16 +91,16 @@ define <32 x half> @insertelement_v32f16(ptr %a) #0 {
 define <64 x half> @insertelement_v64f16(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: insertelement_v64f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #63
 ; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    fmov h2, #5.00000000
-; CHECK-NEXT:    index z3.h, #0, #1
+; CHECK-NEXT:    mov w9, #63 // =0x3f
+; CHECK-NEXT:    index z0.h, #0, #1
 ; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    mov z1.h, w9
-; CHECK-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
-; CHECK-NEXT:    mov z0.h, p1/m, h2
-; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; CHECK-NEXT:    cmpeq p1.h, p1/z, z0.h, z1.h
+; CHECK-NEXT:    fmov h0, #5.00000000
+; CHECK-NEXT:    mov z2.h, p1/m, h0
+; CHECK-NEXT:    st1h { z2.h }, p0, [x8]
 ; CHECK-NEXT:    ret
     %op1 = load <64 x half>, ptr %a
     %r = insertelement <64 x half> %op1, half 5.0, i64 63
@@ -110,16 +110,16 @@ define <64 x half> @insertelement_v64f16(ptr %a) vscale_range(8,0) #0 {
 define <128 x half> @insertelement_v128f16(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: insertelement_v128f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #127
 ; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    fmov h2, #5.00000000
-; CHECK-NEXT:    index z3.h, #0, #1
+; CHECK-NEXT:    mov w9, #127 // =0x7f
+; CHECK-NEXT:    index z0.h, #0, #1
 ; CHECK-NEXT:    ptrue p1.h
 ; CHECK-NEXT:    mov z1.h, w9
-; CHECK-NEXT:    cmpeq p1.h, p1/z, z3.h, z1.h
-; CHECK-NEXT:    mov z0.h, p1/m, h2
-; CHECK-NEXT:    st1h { z0.h }, p0, [x8]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; CHECK-NEXT:    cmpeq p1.h, p1/z, z0.h, z1.h
+; CHECK-NEXT:    fmov h0, #5.00000000
+; CHECK-NEXT:    mov z2.h, p1/m, h0
+; CHECK-NEXT:    st1h { z2.h }, p0, [x8]
 ; CHECK-NEXT:    ret
     %op1 = load <128 x half>, ptr %a
     %r = insertelement <128 x half> %op1, half 5.0, i64 127
@@ -153,16 +153,16 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
 define <8 x float> @insertelement_v8f32(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: insertelement_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #7
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    fmov s2, #5.00000000
-; CHECK-NEXT:    index z3.s, #0, #1
+; CHECK-NEXT:    mov w9, #7 // =0x7
+; CHECK-NEXT:    index z0.s, #0, #1
 ; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    mov z1.s, w9
-; CHECK-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
-; CHECK-NEXT:    mov z0.s, p1/m, s2
-; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; CHECK-NEXT:    cmpeq p1.s, p1/z, z0.s, z1.s
+; CHECK-NEXT:    fmov s0, #5.00000000
+; CHECK-NEXT:    mov z2.s, p1/m, s0
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8]
 ; CHECK-NEXT:    ret
     %op1 = load <8 x float>, ptr %a
     %r = insertelement <8 x float> %op1, float 5.0, i64 7
@@ -172,33 +172,33 @@ define <8 x float> @insertelement_v8f32(ptr %a) vscale_range(2,0) #0 {
 define <16 x float> @insertelement_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: insertelement_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov w10, #7
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    fmov s3, #5.00000000
-; VBITS_GE_256-NEXT:    index z4.s, #0, #1
+; VBITS_GE_256-NEXT:    mov w9, #7 // =0x7
+; VBITS_GE_256-NEXT:    index z0.s, #0, #1
 ; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    mov z2.s, w10
-; VBITS_GE_256-NEXT:    cmpeq p1.s, p1/z, z4.s, z2.s
-; VBITS_GE_256-NEXT:    mov z0.s, p1/m, s3
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8]
+; VBITS_GE_256-NEXT:    mov z1.s, w9
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
+; VBITS_GE_256-NEXT:    fmov s2, #5.00000000
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p1/z, z0.s, z1.s
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    mov z3.s, p1/m, s2
+; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: insertelement_v16f32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov w9, #15
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    fmov s2, #5.00000000
-; VBITS_GE_512-NEXT:    index z3.s, #0, #1
+; VBITS_GE_512-NEXT:    mov w9, #15 // =0xf
+; VBITS_GE_512-NEXT:    index z0.s, #0, #1
 ; VBITS_GE_512-NEXT:    ptrue p1.s
 ; VBITS_GE_512-NEXT:    mov z1.s, w9
-; VBITS_GE_512-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
-; VBITS_GE_512-NEXT:    mov z0.s, p1/m, s2
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8]
+; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    cmpeq p1.s, p1/z, z0.s, z1.s
+; VBITS_GE_512-NEXT:    fmov s0, #5.00000000
+; VBITS_GE_512-NEXT:    mov z2.s, p1/m, s0
+; VBITS_GE_512-NEXT:    st1w { z2.s }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
     %op1 = load <16 x float>, ptr %a
     %r = insertelement <16 x float> %op1, float 5.0, i64 15
@@ -208,16 +208,16 @@ define <16 x float> @insertelement_v16f32(ptr %a) #0 {
 define <32 x float> @insertelement_v32f32(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: insertelement_v32f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #31
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    fmov s2, #5.00000000
-; CHECK-NEXT:    index z3.s, #0, #1
+; CHECK-NEXT:    mov w9, #31 // =0x1f
+; CHECK-NEXT:    index z0.s, #0, #1
 ; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    mov z1.s, w9
-; CHECK-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
-; CHECK-NEXT:    mov z0.s, p1/m, s2
-; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; CHECK-NEXT:    cmpeq p1.s, p1/z, z0.s, z1.s
+; CHECK-NEXT:    fmov s0, #5.00000000
+; CHECK-NEXT:    mov z2.s, p1/m, s0
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8]
 ; CHECK-NEXT:    ret
     %op1 = load <32 x float>, ptr %a
     %r = insertelement <32 x float> %op1, float 5.0, i64 31
@@ -227,16 +227,16 @@ define <32 x float> @insertelement_v32f32(ptr %a) vscale_range(8,0) #0 {
 define <64 x float> @insertelement_v64f32(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: insertelement_v64f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #63
 ; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    fmov s2, #5.00000000
-; CHECK-NEXT:    index z3.s, #0, #1
+; CHECK-NEXT:    mov w9, #63 // =0x3f
+; CHECK-NEXT:    index z0.s, #0, #1
 ; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    mov z1.s, w9
-; CHECK-NEXT:    cmpeq p1.s, p1/z, z3.s, z1.s
-; CHECK-NEXT:    mov z0.s, p1/m, s2
-; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; CHECK-NEXT:    cmpeq p1.s, p1/z, z0.s, z1.s
+; CHECK-NEXT:    fmov s0, #5.00000000
+; CHECK-NEXT:    mov z2.s, p1/m, s0
+; CHECK-NEXT:    st1w { z2.s }, p0, [x8]
 ; CHECK-NEXT:    ret
     %op1 = load <64 x float>, ptr %a
     %r = insertelement <64 x float> %op1, float 5.0, i64 63
@@ -247,7 +247,7 @@ define <64 x float> @insertelement_v64f32(ptr %a) vscale_range(16,0) #0 {
 define <1 x double> @insertelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: insertelement_v1f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4617315517961601024
+; CHECK-NEXT:    mov x8, #4617315517961601024 // =0x4014000000000000
 ; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    ret
     %r = insertelement <1 x double> %op1, double 5.0, i64 0
@@ -268,16 +268,16 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0
 define <4 x double> @insertelement_v4f64(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: insertelement_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #3
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    fmov d2, #5.00000000
-; CHECK-NEXT:    index z3.d, #0, #1
+; CHECK-NEXT:    mov w9, #3 // =0x3
+; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    mov z1.d, x9
-; CHECK-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
-; CHECK-NEXT:    mov z0.d, p1/m, d2
-; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT:    cmpeq p1.d, p1/z, z0.d, z1.d
+; CHECK-NEXT:    fmov d0, #5.00000000
+; CHECK-NEXT:    mov z2.d, p1/m, d0
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8]
 ; CHECK-NEXT:    ret
     %op1 = load <4 x double>, ptr %a
     %r = insertelement <4 x double> %op1, double 5.0, i64 3
@@ -287,33 +287,33 @@ define <4 x double> @insertelement_v4f64(ptr %a) vscale_range(2,0) #0 {
 define <8 x double> @insertelement_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: insertelement_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
-; VBITS_GE_256-NEXT:    mov w10, #3
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    fmov d3, #5.00000000
-; VBITS_GE_256-NEXT:    index z4.d, #0, #1
+; VBITS_GE_256-NEXT:    mov w9, #3 // =0x3
+; VBITS_GE_256-NEXT:    index z0.d, #0, #1
 ; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    mov z2.d, x10
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p1/z, z4.d, z2.d
-; VBITS_GE_256-NEXT:    mov z0.d, p1/m, d3
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8, x9, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8]
+; VBITS_GE_256-NEXT:    mov z1.d, x9
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
+; VBITS_GE_256-NEXT:    fmov d2, #5.00000000
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0, x9, lsl #3]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p1/z, z0.d, z1.d
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    mov z3.d, p1/m, d2
+; VBITS_GE_256-NEXT:    st1d { z3.d }, p0, [x8, x9, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: insertelement_v8f64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov w9, #7
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    fmov d2, #5.00000000
-; VBITS_GE_512-NEXT:    index z3.d, #0, #1
+; VBITS_GE_512-NEXT:    mov w9, #7 // =0x7
+; VBITS_GE_512-NEXT:    index z0.d, #0, #1
 ; VBITS_GE_512-NEXT:    ptrue p1.d
 ; VBITS_GE_512-NEXT:    mov z1.d, x9
-; VBITS_GE_512-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
-; VBITS_GE_512-NEXT:    mov z0.d, p1/m, d2
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x8]
+; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    cmpeq p1.d, p1/z, z0.d, z1.d
+; VBITS_GE_512-NEXT:    fmov d0, #5.00000000
+; VBITS_GE_512-NEXT:    mov z2.d, p1/m, d0
+; VBITS_GE_512-NEXT:    st1d { z2.d }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
     %op1 = load <8 x double>, ptr %a
     %r = insertelement <8 x double> %op1, double 5.0, i64 7
@@ -323,16 +323,16 @@ define <8 x double> @insertelement_v8f64(ptr %a) #0 {
 define <16 x double> @insertelement_v16f64(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: insertelement_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #15
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    fmov d2, #5.00000000
-; CHECK-NEXT:    index z3.d, #0, #1
+; CHECK-NEXT:    mov w9, #15 // =0xf
+; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    mov z1.d, x9
-; CHECK-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
-; CHECK-NEXT:    mov z0.d, p1/m, d2
-; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT:    cmpeq p1.d, p1/z, z0.d, z1.d
+; CHECK-NEXT:    fmov d0, #5.00000000
+; CHECK-NEXT:    mov z2.d, p1/m, d0
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8]
 ; CHECK-NEXT:    ret
     %op1 = load <16 x double>, ptr %a
     %r = insertelement <16 x double> %op1, double 5.0, i64 15
@@ -342,16 +342,16 @@ define <16 x double> @insertelement_v16f64(ptr %a) vscale_range(8,0) #0 {
 define <32 x double> @insertelement_v32f64(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: insertelement_v32f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #31
 ; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    fmov d2, #5.00000000
-; CHECK-NEXT:    index z3.d, #0, #1
+; CHECK-NEXT:    mov w9, #31 // =0x1f
+; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    mov z1.d, x9
-; CHECK-NEXT:    cmpeq p1.d, p1/z, z3.d, z1.d
-; CHECK-NEXT:    mov z0.d, p1/m, d2
-; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT:    cmpeq p1.d, p1/z, z0.d, z1.d
+; CHECK-NEXT:    fmov d0, #5.00000000
+; CHECK-NEXT:    mov z2.d, p1/m, d0
+; CHECK-NEXT:    st1d { z2.d }, p0, [x8]
 ; CHECK-NEXT:    ret
     %op1 = load <32 x double>, ptr %a
     %r = insertelement <32 x double> %op1, double 5.0, i64 31

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
index 91e0c62c8002c6..6a8d7e47a2bd3a 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
@@ -48,8 +48,8 @@ define void @add_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @add_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: add_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -146,8 +146,8 @@ define void @add_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @add_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: add_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -244,8 +244,8 @@ define void @add_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @add_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: add_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -342,8 +342,8 @@ define void @add_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @add_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: add_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -388,8 +388,8 @@ define void @add_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 define void @add_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: add_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #16
 ; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    mov x8, #16 // =0x10
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -449,8 +449,8 @@ define void @mul_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @mul_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: mul_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -547,8 +547,8 @@ define void @mul_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @mul_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: mul_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -645,8 +645,8 @@ define void @mul_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @mul_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: mul_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -707,8 +707,8 @@ define void @mul_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: mul_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -720,8 +720,8 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: mul_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -749,8 +749,8 @@ define void @mul_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @mul_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: mul_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -851,8 +851,8 @@ define void @sub_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @sub_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: sub_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -949,8 +949,8 @@ define void @sub_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @sub_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: sub_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -1047,8 +1047,8 @@ define void @sub_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @sub_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: sub_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -1145,8 +1145,8 @@ define void @sub_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @sub_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: sub_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -1246,8 +1246,8 @@ define void @abs_v32i8(ptr %a) vscale_range(2,0) #0 {
 define void @abs_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: abs_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    abs z0.b, p0/m, z0.b
@@ -1334,8 +1334,8 @@ define void @abs_v16i16(ptr %a) vscale_range(2,0) #0 {
 define void @abs_v32i16(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #16
 ; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    mov x8, #16 // =0x10
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
@@ -1352,21 +1352,21 @@ define void @abs_v32i16(ptr %a) vscale_range(2,0) #0 {
 define void @abs_v64i16(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v64i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #48
-; CHECK-NEXT:    mov x9, #16
-; CHECK-NEXT:    mov x10, #32
 ; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    mov x8, #32 // =0x20
+; CHECK-NEXT:    mov x9, #48 // =0x30
+; CHECK-NEXT:    mov x10, #16 // =0x10
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
 ; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
 ; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0]
-; CHECK-NEXT:    abs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
+; CHECK-NEXT:    abs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    abs z2.h, p0/m, z2.h
 ; CHECK-NEXT:    abs z3.h, p0/m, z3.h
-; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
+; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
 ; CHECK-NEXT:    st1h { z3.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, ptr %a
@@ -1378,38 +1378,42 @@ define void @abs_v64i16(ptr %a) vscale_range(2,0) #0 {
 define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: abs_v128i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #96
-; CHECK-NEXT:    mov x9, #48
-; CHECK-NEXT:    mov x10, #16
-; CHECK-NEXT:    mov x11, #80
-; CHECK-NEXT:    mov x12, #32
-; CHECK-NEXT:    mov x13, #112
-; CHECK-NEXT:    mov x14, #64
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x10, lsl #1]
-; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x11, lsl #1]
-; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0, x12, lsl #1]
-; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x0, x13, lsl #1]
-; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x0, x14, lsl #1]
-; CHECK-NEXT:    ld1h { z6.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #96 // =0x60
+; CHECK-NEXT:    mov x9, #112 // =0x70
+; CHECK-NEXT:    mov x10, #64 // =0x40
+; CHECK-NEXT:    mov x11, #80 // =0x50
+; CHECK-NEXT:    mov x12, #32 // =0x20
+; CHECK-NEXT:    mov x13, #48 // =0x30
+; CHECK-NEXT:    mov x14, #16 // =0x10
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
+; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
+; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
+; CHECK-NEXT:    ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
 ; CHECK-NEXT:    ld1h { z7.h }, p0/z, [x0]
-; CHECK-NEXT:    abs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
-; CHECK-NEXT:    abs z3.h, p0/m, z3.h
+; CHECK-NEXT:    abs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    abs z2.h, p0/m, z2.h
-; CHECK-NEXT:    abs z5.h, p0/m, z5.h
-; CHECK-NEXT:    abs z4.h, p0/m, z4.h
-; CHECK-NEXT:    abs z6.h, p0/m, z6.h
-; CHECK-NEXT:    abs z7.h, p0/m, z7.h
-; CHECK-NEXT:    st1h { z6.h }, p0, [x0, x8, lsl #1]
-; CHECK-NEXT:    st1h { z4.h }, p0, [x0, x13, lsl #1]
-; CHECK-NEXT:    st1h { z5.h }, p0, [x0, x14, lsl #1]
-; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x11, lsl #1]
-; CHECK-NEXT:    st1h { z3.h }, p0, [x0, x12, lsl #1]
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x9, lsl #1]
-; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x10, lsl #1]
-; CHECK-NEXT:    st1h { z7.h }, p0, [x0]
+; CHECK-NEXT:    abs z3.h, p0/m, z3.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; CHECK-NEXT:    movprfx z0, z4
+; CHECK-NEXT:    abs z0.h, p0/m, z4.h
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
+; CHECK-NEXT:    movprfx z1, z5
+; CHECK-NEXT:    abs z1.h, p0/m, z5.h
+; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
+; CHECK-NEXT:    movprfx z2, z6
+; CHECK-NEXT:    abs z2.h, p0/m, z6.h
+; CHECK-NEXT:    st1h { z3.h }, p0, [x0, x11, lsl #1]
+; CHECK-NEXT:    movprfx z3, z7
+; CHECK-NEXT:    abs z3.h, p0/m, z7.h
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x12, lsl #1]
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x13, lsl #1]
+; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x14, lsl #1]
+; CHECK-NEXT:    st1h { z3.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, ptr %a
   %res = call <128 x i16> @llvm.abs.v128i16(<128 x i16> %op1, i1 false)
@@ -1454,8 +1458,8 @@ define void @abs_v8i32(ptr %a) vscale_range(2,0) #0 {
 define void @abs_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: abs_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    abs z0.s, p0/m, z0.s
@@ -1542,8 +1546,8 @@ define void @abs_v4i64(ptr %a) vscale_range(2,0) #0 {
 define void @abs_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: abs_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    abs z0.d, p0/m, z0.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
index 2962e35131b92f..254dd912545573 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll
@@ -52,8 +52,8 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @icmp_eq_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: icmp_eq_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -162,8 +162,8 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @icmp_eq_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: icmp_eq_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -272,8 +272,8 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @icmp_eq_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: icmp_eq_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -382,8 +382,8 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @icmp_eq_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: icmp_eq_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
index cfd755e20f1267..a01ef8bf064e83 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll
@@ -15,8 +15,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v8i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
@@ -94,26 +94,26 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v16i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    sshll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
 ; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
 ; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
 ; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    sshll v1.8h, v1.8b, #0
-; VBITS_GE_128-NEXT:    sshll v0.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_128-NEXT:    sshll2 v3.4s, v1.8h, #0
-; VBITS_GE_128-NEXT:    sshll2 v5.4s, v0.8h, #0
 ; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
-; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
-; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
 ; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
 ; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: sdiv_v16i8:
@@ -131,11 +131,11 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z0.b, z2.b, z2.b
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -203,6 +203,7 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: sdiv_v128i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl128
+; CHECK-NEXT:    ptrue p1.h, vl128
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl64
@@ -216,12 +217,11 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    splice z2.h, p0, z2.h, z0.h
-; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    st1b { z2.h }, p0, [x0]
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    st1b { z1.h }, p1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, ptr %a
   %op2 = load <128 x i8>, ptr %b
@@ -237,41 +237,40 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p2.h, vl64
 ; CHECK-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEXT:    sunpklo z3.h, z0.b
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
-; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z5.s, z1.h
+; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    sunpklo z5.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
-; CHECK-NEXT:    sunpklo z4.s, z0.h
+; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z5.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    splice z3.h, p2, z3.h, z2.h
-; CHECK-NEXT:    movprfx z2, z4
-; CHECK-NEXT:    sdiv z2.s, p1/m, z2.s, z5.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    uzp1 z1.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z2.h
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
-; CHECK-NEXT:    splice z1.h, p2, z1.h, z0.h
+; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
 ; CHECK-NEXT:    ptrue p1.b, vl128
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
-; CHECK-NEXT:    splice z2.b, p1, z2.b, z0.b
-; CHECK-NEXT:    st1b { z2.b }, p0, [x0]
+; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
+; CHECK-NEXT:    splice z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, ptr %a
   %op2 = load <256 x i8>, ptr %b
@@ -285,8 +284,8 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v4i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_128-NEXT:    xtn v0.4h, v0.4s
@@ -294,13 +293,13 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ;
 ; VBITS_GE_256-LABEL: sdiv_v4i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    sshll v1.4s, v1.4h, #0
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    sshll v1.4s, v1.4h, #0
 ; VBITS_GE_256-NEXT:    sshll v0.4s, v0.4h, #0
 ; VBITS_GE_256-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
 ; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
-; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
 ; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
+; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
 ; VBITS_GE_256-NEXT:    mov v0.h[1], w8
 ; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
 ; VBITS_GE_256-NEXT:    mov v0.h[2], w9
@@ -310,13 +309,13 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ;
 ; VBITS_GE_512-LABEL: sdiv_v4i16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    sshll v1.4s, v1.4h, #0
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_512-NEXT:    sshll v1.4s, v1.4h, #0
 ; VBITS_GE_512-NEXT:    sshll v0.4s, v0.4h, #0
 ; VBITS_GE_512-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
 ; VBITS_GE_512-NEXT:    mov w8, v1.s[1]
-; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
 ; VBITS_GE_512-NEXT:    mov v0.16b, v1.16b
+; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
 ; VBITS_GE_512-NEXT:    mov v0.h[1], w8
 ; VBITS_GE_512-NEXT:    mov w8, v1.s[3]
 ; VBITS_GE_512-NEXT:    mov v0.h[2], w9
@@ -330,8 +329,8 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v8i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
 ; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
@@ -342,9 +341,9 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ;
 ; VBITS_GE_256-LABEL: sdiv_v8i16:
 ; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
@@ -354,9 +353,9 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ;
 ; VBITS_GE_512-LABEL: sdiv_v8i16:
 ; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_512-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
@@ -370,24 +369,24 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v16i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q3, q0, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
+; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
+; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v4.8h, #0
+; VBITS_GE_128-NEXT:    sshll v4.4s, v4.4h, #0
+; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    ldr q3, [x0]
 ; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
 ; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
-; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
-; VBITS_GE_128-NEXT:    sshll2 v4.4s, v0.8h, #0
-; VBITS_GE_128-NEXT:    sshll v0.4s, v0.4h, #0
-; VBITS_GE_128-NEXT:    sshll2 v7.4s, v1.8h, #0
-; VBITS_GE_128-NEXT:    sshll v1.4s, v1.4h, #0
-; VBITS_GE_128-NEXT:    sshll2 v5.4s, v2.8h, #0
-; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
-; VBITS_GE_128-NEXT:    sdiv z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    sdivr z0.s, p0/m, z0.s, z2.s
-; VBITS_GE_128-NEXT:    movprfx z2, z7
-; VBITS_GE_128-NEXT:    sdiv z2.s, p0/m, z2.s, z6.s
-; VBITS_GE_128-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
-; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
+; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 v1.8h, v3.8h, v5.8h
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
@@ -405,11 +404,11 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z0.h
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z0.h
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: sdiv_v16i16:
@@ -483,11 +482,11 @@ define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ptrue p1.h, vl64
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z0.h
-; CHECK-NEXT:    st1h { z2.h }, p0, [x0]
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, ptr %a
   %op2 = load <128 x i16>, ptr %b
@@ -500,8 +499,8 @@ define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -514,8 +513,8 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #
 define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -543,31 +542,30 @@ define void @sdiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @sdiv_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v16i32:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
-; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z4.s
-; VBITS_GE_128-NEXT:    sdiv z1.s, p0/m, z1.s, z5.s
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT:    ldp q6, q4, [x1]
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z0, z2
-; VBITS_GE_128-NEXT:    sdiv z0.s, p0/m, z0.s, z6.s
-; VBITS_GE_128-NEXT:    movprfx z1, z3
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
+; VBITS_GE_128-NEXT:    sdivr z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
 ; VBITS_GE_128-NEXT:    sdiv z1.s, p0/m, z1.s, z4.s
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
+; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    sdiv z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: sdiv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sdiv z0.s, p0/m, z0.s, z2.s
-; VBITS_GE_256-NEXT:    sdiv z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sdiv z1.s, p0/m, z1.s, z2.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -623,8 +621,8 @@ define void @sdiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -637,8 +635,8 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
 define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -666,31 +664,30 @@ define void @sdiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @sdiv_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v8i64:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
-; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
-; VBITS_GE_128-NEXT:    sdiv z0.d, p0/m, z0.d, z4.d
-; VBITS_GE_128-NEXT:    sdiv z1.d, p0/m, z1.d, z5.d
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT:    ldp q6, q4, [x1]
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z0, z2
-; VBITS_GE_128-NEXT:    sdiv z0.d, p0/m, z0.d, z6.d
-; VBITS_GE_128-NEXT:    movprfx z1, z3
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
+; VBITS_GE_128-NEXT:    sdivr z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
 ; VBITS_GE_128-NEXT:    sdiv z1.d, p0/m, z1.d, z4.d
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
+; VBITS_GE_128-NEXT:    sdiv z4.d, p0/m, z4.d, z5.d
+; VBITS_GE_128-NEXT:    sdiv z2.d, p0/m, z2.d, z3.d
+; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: sdiv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sdiv z0.d, p0/m, z0.d, z2.d
-; VBITS_GE_256-NEXT:    sdiv z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sdiv z1.d, p0/m, z1.d, z2.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -751,8 +748,8 @@ define void @sdiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: udiv_v8i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
@@ -830,26 +827,26 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: udiv_v16i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ushll2 v3.8h, v0.16b, #0
+; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
 ; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
 ; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
 ; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    ushll v1.8h, v1.8b, #0
-; VBITS_GE_128-NEXT:    ushll v0.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_128-NEXT:    ushll2 v3.4s, v1.8h, #0
-; VBITS_GE_128-NEXT:    ushll2 v5.4s, v0.8h, #0
 ; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
-; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
-; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
 ; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
 ; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
+; VBITS_GE_128-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: udiv_v16i8:
@@ -867,11 +864,11 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    splice z2.h, p0, z2.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z0.b, z2.b, z2.b
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -940,11 +937,11 @@ define void @udiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    udivr z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ptrue p1.h, vl64
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z0.h
-; CHECK-NEXT:    st1b { z2.h }, p0, [x0]
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    st1b { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, ptr %a
   %op2 = load <128 x i8>, ptr %b
@@ -960,41 +957,40 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p2.h, vl64
 ; CHECK-NEXT:    uunpklo z2.h, z1.b
 ; CHECK-NEXT:    uunpklo z3.h, z0.b
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
-; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z5.s, z1.h
+; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
-; CHECK-NEXT:    uunpklo z4.s, z0.h
+; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #128
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z5.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    splice z3.h, p2, z3.h, z2.h
-; CHECK-NEXT:    movprfx z2, z4
-; CHECK-NEXT:    udiv z2.s, p1/m, z2.s, z5.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    uzp1 z1.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z2.h
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
-; CHECK-NEXT:    splice z1.h, p2, z1.h, z0.h
+; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
 ; CHECK-NEXT:    ptrue p1.b, vl128
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
-; CHECK-NEXT:    splice z2.b, p1, z2.b, z0.b
-; CHECK-NEXT:    st1b { z2.b }, p0, [x0]
+; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
+; CHECK-NEXT:    splice z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, ptr %a
   %op2 = load <256 x i8>, ptr %b
@@ -1008,8 +1004,8 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: udiv_v4i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; VBITS_GE_128-NEXT:    xtn v0.4h, v0.4s
@@ -1017,13 +1013,13 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ;
 ; VBITS_GE_256-LABEL: udiv_v4i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ushll v1.4s, v1.4h, #0
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    ushll v1.4s, v1.4h, #0
 ; VBITS_GE_256-NEXT:    ushll v0.4s, v0.4h, #0
 ; VBITS_GE_256-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
 ; VBITS_GE_256-NEXT:    mov w8, v1.s[1]
-; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
 ; VBITS_GE_256-NEXT:    mov v0.16b, v1.16b
+; VBITS_GE_256-NEXT:    mov w9, v1.s[2]
 ; VBITS_GE_256-NEXT:    mov v0.h[1], w8
 ; VBITS_GE_256-NEXT:    mov w8, v1.s[3]
 ; VBITS_GE_256-NEXT:    mov v0.h[2], w9
@@ -1033,13 +1029,13 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ;
 ; VBITS_GE_512-LABEL: udiv_v4i16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ushll v1.4s, v1.4h, #0
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_512-NEXT:    ushll v1.4s, v1.4h, #0
 ; VBITS_GE_512-NEXT:    ushll v0.4s, v0.4h, #0
 ; VBITS_GE_512-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
 ; VBITS_GE_512-NEXT:    mov w8, v1.s[1]
-; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
 ; VBITS_GE_512-NEXT:    mov v0.16b, v1.16b
+; VBITS_GE_512-NEXT:    mov w9, v1.s[2]
 ; VBITS_GE_512-NEXT:    mov v0.h[1], w8
 ; VBITS_GE_512-NEXT:    mov w8, v1.s[3]
 ; VBITS_GE_512-NEXT:    mov v0.h[2], w9
@@ -1053,8 +1049,8 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: udiv_v8i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
 ; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
@@ -1065,9 +1061,9 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ;
 ; VBITS_GE_256-LABEL: udiv_v8i16:
 ; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
@@ -1077,9 +1073,9 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ;
 ; VBITS_GE_512-LABEL: udiv_v8i16:
 ; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_512-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
@@ -1093,24 +1089,24 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @udiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: udiv_v16i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q3, q0, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
+; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
+; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v4.8h, #0
+; VBITS_GE_128-NEXT:    ushll v4.4s, v4.4h, #0
+; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    ldr q3, [x0]
 ; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
 ; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
-; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
-; VBITS_GE_128-NEXT:    ushll2 v4.4s, v0.8h, #0
-; VBITS_GE_128-NEXT:    ushll v0.4s, v0.4h, #0
-; VBITS_GE_128-NEXT:    ushll2 v7.4s, v1.8h, #0
-; VBITS_GE_128-NEXT:    ushll v1.4s, v1.4h, #0
-; VBITS_GE_128-NEXT:    ushll2 v5.4s, v2.8h, #0
-; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
-; VBITS_GE_128-NEXT:    udiv z1.s, p0/m, z1.s, z3.s
-; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    udivr z0.s, p0/m, z0.s, z2.s
-; VBITS_GE_128-NEXT:    movprfx z2, z7
-; VBITS_GE_128-NEXT:    udiv z2.s, p0/m, z2.s, z6.s
-; VBITS_GE_128-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
-; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
+; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
+; VBITS_GE_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    uzp1 v1.8h, v3.8h, v5.8h
+; VBITS_GE_128-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
 ; VBITS_GE_128-NEXT:    stp q1, q0, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
@@ -1128,11 +1124,11 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z0.h
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z0.h
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: udiv_v16i16:
@@ -1197,11 +1193,11 @@ define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
 ; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ptrue p1.h, vl64
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z0.h
-; CHECK-NEXT:    st1h { z2.h }, p0, [x0]
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, ptr %a
   %op2 = load <128 x i16>, ptr %b
@@ -1214,8 +1210,8 @@ define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: udiv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1228,8 +1224,8 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(1,0) #
 define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: udiv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -1257,31 +1253,30 @@ define void @udiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @udiv_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: udiv_v16i32:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
-; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z4.s
-; VBITS_GE_128-NEXT:    udiv z1.s, p0/m, z1.s, z5.s
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT:    ldp q6, q4, [x1]
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z0, z2
-; VBITS_GE_128-NEXT:    udiv z0.s, p0/m, z0.s, z6.s
-; VBITS_GE_128-NEXT:    movprfx z1, z3
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
+; VBITS_GE_128-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
+; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
 ; VBITS_GE_128-NEXT:    udiv z1.s, p0/m, z1.s, z4.s
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
+; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z5.s
+; VBITS_GE_128-NEXT:    udiv z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: udiv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    udiv z0.s, p0/m, z0.s, z2.s
-; VBITS_GE_256-NEXT:    udiv z1.s, p0/m, z1.s, z3.s
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    udiv z1.s, p0/m, z1.s, z2.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -1337,8 +1332,8 @@ define void @udiv_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: udiv_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1351,8 +1346,8 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
 define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: udiv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -1380,31 +1375,30 @@ define void @udiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @udiv_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: udiv_v8i64:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
-; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
-; VBITS_GE_128-NEXT:    udiv z0.d, p0/m, z0.d, z4.d
-; VBITS_GE_128-NEXT:    udiv z1.d, p0/m, z1.d, z5.d
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT:    ldp q6, q4, [x1]
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z0, z2
-; VBITS_GE_128-NEXT:    udiv z0.d, p0/m, z0.d, z6.d
-; VBITS_GE_128-NEXT:    movprfx z1, z3
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    ldp q5, q4, [x1, #32]
+; VBITS_GE_128-NEXT:    udivr z0.d, p0/m, z0.d, z1.d
+; VBITS_GE_128-NEXT:    ldr q1, [x0, #48]
 ; VBITS_GE_128-NEXT:    udiv z1.d, p0/m, z1.d, z4.d
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
+; VBITS_GE_128-NEXT:    ldr q4, [x0, #32]
+; VBITS_GE_128-NEXT:    udiv z4.d, p0/m, z4.d, z5.d
+; VBITS_GE_128-NEXT:    udiv z2.d, p0/m, z2.d, z3.d
+; VBITS_GE_128-NEXT:    stp q4, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    stp q0, q2, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: udiv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    udiv z0.d, p0/m, z0.d, z2.d
-; VBITS_GE_256-NEXT:    udiv z1.d, p0/m, z1.d, z3.d
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    udiv z1.d, p0/m, z1.d, z2.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
index 76b5df751a89fc..756e5f4cddf809 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll
@@ -58,8 +58,8 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) vscale_range(2,0) #0 {
 define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v16i8_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -73,7 +73,7 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: sext_v32i8_v32i16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
@@ -157,13 +157,13 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 {
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
 ; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -242,14 +242,14 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: sext_v8i8_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -308,8 +308,8 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
 define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v8i16_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -322,7 +322,7 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: sext_v16i16_v16i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
@@ -406,13 +406,13 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 {
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -472,8 +472,8 @@ define void @sext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
 define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sext_v4i32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -486,7 +486,7 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: sext_v8i32_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
@@ -554,8 +554,8 @@ define void @sext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
 define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: zext_v16i8_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -569,7 +569,7 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: zext_v32i8_v32i16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov x8, #16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
@@ -653,13 +653,13 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 {
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
 ; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -738,14 +738,14 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: zext_v8i8_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -804,8 +804,8 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
 define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: zext_v8i16_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -818,7 +818,7 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: zext_v16i16_v16i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
@@ -902,13 +902,13 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 {
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -968,8 +968,8 @@ define void @zext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
 define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: zext_v4i32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -982,7 +982,7 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: zext_v8i32_v8i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
index d8e63745be1a94..149ce49665c64f 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll
@@ -48,8 +48,8 @@ define void @and_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @and_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: and_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -146,8 +146,8 @@ define void @and_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @and_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: and_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -244,8 +244,8 @@ define void @and_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @and_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: and_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -342,8 +342,8 @@ define void @and_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @and_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: and_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -444,8 +444,8 @@ define void @or_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @or_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: or_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -542,8 +542,8 @@ define void @or_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @or_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: or_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -640,8 +640,8 @@ define void @or_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @or_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: or_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -738,8 +738,8 @@ define void @or_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @or_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: or_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -840,8 +840,8 @@ define void @xor_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @xor_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: xor_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -938,8 +938,8 @@ define void @xor_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @xor_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: xor_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -1036,8 +1036,8 @@ define void @xor_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @xor_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: xor_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -1134,8 +1134,8 @@ define void @xor_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @xor_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: xor_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
index b13852a24800db..4091c01fe93fbd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll
@@ -48,8 +48,8 @@ define void @smax_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smax_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smax_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -146,8 +146,8 @@ define void @smax_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smax_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smax_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -244,8 +244,8 @@ define void @smax_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smax_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smax_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -307,8 +307,8 @@ define void @smax_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -321,8 +321,8 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #
 define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smax_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -350,8 +350,8 @@ define void @smax_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smax_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smax_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -452,8 +452,8 @@ define void @smin_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smin_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smin_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -550,8 +550,8 @@ define void @smin_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smin_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smin_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -648,8 +648,8 @@ define void @smin_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smin_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smin_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -711,8 +711,8 @@ define void @smin_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -725,8 +725,8 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #
 define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smin_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -754,8 +754,8 @@ define void @smin_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smin_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smin_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -856,8 +856,8 @@ define void @umax_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umax_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umax_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -954,8 +954,8 @@ define void @umax_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umax_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umax_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -1052,8 +1052,8 @@ define void @umax_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umax_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umax_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -1115,8 +1115,8 @@ define void @umax_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1129,8 +1129,8 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #
 define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umax_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -1158,8 +1158,8 @@ define void @umax_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umax_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umax_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -1260,8 +1260,8 @@ define void @umin_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umin_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umin_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -1358,8 +1358,8 @@ define void @umin_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umin_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umin_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -1456,8 +1456,8 @@ define void @umin_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umin_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umin_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -1519,8 +1519,8 @@ define void @umin_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1533,8 +1533,8 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #
 define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umin_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -1562,8 +1562,8 @@ define void @umin_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umin_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umin_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
index ce29b0aae00771..ae230fc2c92d93 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -16,8 +16,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -36,8 +36,8 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -73,8 +73,8 @@ define void @smulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smulh_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smulh_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -150,8 +150,8 @@ define void @smulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -168,8 +168,8 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0)
 define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -205,8 +205,8 @@ define void @smulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smulh_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smulh_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -280,8 +280,8 @@ define void @smulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -298,8 +298,8 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0)
 define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -335,8 +335,8 @@ define void @smulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smulh_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smulh_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -410,8 +410,8 @@ define void @smulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -430,8 +430,8 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0)
 define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smulh_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -467,8 +467,8 @@ define void @smulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @smulh_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: smulh_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -547,8 +547,8 @@ define void @smulh_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -565,8 +565,8 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
 define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -602,8 +602,8 @@ define void @umulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umulh_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umulh_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -680,8 +680,8 @@ define void @umulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -698,8 +698,8 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0)
 define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -735,8 +735,8 @@ define void @umulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umulh_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umulh_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -810,8 +810,8 @@ define void @umulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -828,8 +828,8 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0)
 define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -867,8 +867,8 @@ define void @umulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umulh_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umulh_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -942,8 +942,8 @@ define void @umulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -960,8 +960,8 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0)
 define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umulh_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -997,8 +997,8 @@ define void @umulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @umulh_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: umulh_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
index 942faba2eacf62..752c2cd34bfe48 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll
@@ -48,8 +48,8 @@ define i8 @uaddv_v32i8(ptr %a) vscale_range(2,0) #0 {
 define i8 @uaddv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: uaddv_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    add z0.b, z1.b, z0.b
@@ -138,8 +138,8 @@ define i16 @uaddv_v16i16(ptr %a) vscale_range(2,0) #0 {
 define i16 @uaddv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: uaddv_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    add z0.h, z1.h, z0.h
@@ -228,8 +228,8 @@ define i32 @uaddv_v8i32(ptr %a) vscale_range(2,0) #0 {
 define i32 @uaddv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: uaddv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    add z0.s, z1.s, z0.s
@@ -317,8 +317,8 @@ define i64 @uaddv_v4i64(ptr %a) vscale_range(2,0) #0 {
 define i64 @uaddv_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: uaddv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    add z0.d, z1.d, z0.d
@@ -406,8 +406,8 @@ define i8 @smaxv_v32i8(ptr %a) vscale_range(2,0) #0 {
 define i8 @smaxv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: smaxv_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    smax z0.b, p0/m, z0.b, z1.b
@@ -491,8 +491,8 @@ define i16 @smaxv_v16i16(ptr %a) vscale_range(2,0) #0 {
 define i16 @smaxv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: smaxv_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    smax z0.h, p0/m, z0.h, z1.h
@@ -576,8 +576,8 @@ define i32 @smaxv_v8i32(ptr %a) vscale_range(2,0) #0 {
 define i32 @smaxv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: smaxv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    smax z0.s, p0/m, z0.s, z1.s
@@ -638,8 +638,8 @@ define i64 @smaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 define i64 @smaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: smaxv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    smaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -663,8 +663,8 @@ define i64 @smaxv_v4i64(ptr %a) vscale_range(2,0) #0 {
 define i64 @smaxv_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: smaxv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    smax z0.d, p0/m, z0.d, z1.d
@@ -752,8 +752,8 @@ define i8 @sminv_v32i8(ptr %a) vscale_range(2,0) #0 {
 define i8 @sminv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: sminv_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    smin z0.b, p0/m, z0.b, z1.b
@@ -837,8 +837,8 @@ define i16 @sminv_v16i16(ptr %a) vscale_range(2,0) #0 {
 define i16 @sminv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: sminv_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    smin z0.h, p0/m, z0.h, z1.h
@@ -922,8 +922,8 @@ define i32 @sminv_v8i32(ptr %a) vscale_range(2,0) #0 {
 define i32 @sminv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: sminv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    smin z0.s, p0/m, z0.s, z1.s
@@ -984,8 +984,8 @@ define i64 @sminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 define i64 @sminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sminv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -1009,8 +1009,8 @@ define i64 @sminv_v4i64(ptr %a) vscale_range(2,0) #0 {
 define i64 @sminv_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: sminv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    smin z0.d, p0/m, z0.d, z1.d
@@ -1098,8 +1098,8 @@ define i8 @umaxv_v32i8(ptr %a) vscale_range(2,0) #0 {
 define i8 @umaxv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: umaxv_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    umax z0.b, p0/m, z0.b, z1.b
@@ -1183,8 +1183,8 @@ define i16 @umaxv_v16i16(ptr %a) vscale_range(2,0) #0 {
 define i16 @umaxv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: umaxv_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    umax z0.h, p0/m, z0.h, z1.h
@@ -1268,8 +1268,8 @@ define i32 @umaxv_v8i32(ptr %a) vscale_range(2,0) #0 {
 define i32 @umaxv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: umaxv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    umax z0.s, p0/m, z0.s, z1.s
@@ -1330,8 +1330,8 @@ define i64 @umaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 define i64 @umaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: umaxv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    umaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -1355,8 +1355,8 @@ define i64 @umaxv_v4i64(ptr %a) vscale_range(2,0) #0 {
 define i64 @umaxv_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: umaxv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    umax z0.d, p0/m, z0.d, z1.d
@@ -1444,8 +1444,8 @@ define i8 @uminv_v32i8(ptr %a) vscale_range(2,0) #0 {
 define i8 @uminv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: uminv_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    umin z0.b, p0/m, z0.b, z1.b
@@ -1529,8 +1529,8 @@ define i16 @uminv_v16i16(ptr %a) vscale_range(2,0) #0 {
 define i16 @uminv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: uminv_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    umin z0.h, p0/m, z0.h, z1.h
@@ -1614,8 +1614,8 @@ define i32 @uminv_v8i32(ptr %a) vscale_range(2,0) #0 {
 define i32 @uminv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: uminv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    umin z0.s, p0/m, z0.s, z1.s
@@ -1676,8 +1676,8 @@ define i64 @uminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 define i64 @uminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: uminv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -1701,8 +1701,8 @@ define i64 @uminv_v4i64(ptr %a) vscale_range(2,0) #0 {
 define i64 @uminv_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: uminv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    umin z0.d, p0/m, z0.d, z1.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
index 376214c6f65b6a..b0edf0ddebd4e0 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll
@@ -15,8 +15,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: srem_v8i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    sshll v2.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll v2.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    sshll v3.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
 ; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
@@ -97,26 +97,25 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: srem_v16i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    sshll2 v3.8h, v0.16b, #0
 ; VBITS_GE_128-NEXT:    sshll2 v4.4s, v2.8h, #0
 ; VBITS_GE_128-NEXT:    sshll2 v5.4s, v3.8h, #0
 ; VBITS_GE_128-NEXT:    sshll v2.4s, v2.4h, #0
 ; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    sshll v6.8h, v1.8b, #0
-; VBITS_GE_128-NEXT:    sshll v7.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    sshll v5.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    sshll2 v7.4s, v5.8h, #0
+; VBITS_GE_128-NEXT:    sshll v5.4s, v5.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    sshll2 v3.4s, v6.8h, #0
-; VBITS_GE_128-NEXT:    sshll2 v5.4s, v7.8h, #0
-; VBITS_GE_128-NEXT:    sshll v6.4s, v6.4h, #0
-; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
-; VBITS_GE_128-NEXT:    sshll v7.4s, v7.4h, #0
+; VBITS_GE_128-NEXT:    sshll v3.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    sshll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
 ; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
-; VBITS_GE_128-NEXT:    movprfx z4, z7
-; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z6.s
-; VBITS_GE_128-NEXT:    uzp1 v3.8h, v4.8h, v3.8h
+; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT:    uzp1 v3.8h, v3.8h, v6.8h
 ; VBITS_GE_128-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
 ; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
 ; VBITS_GE_128-NEXT:    ret
@@ -127,20 +126,20 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; VBITS_GE_256-NEXT:    sunpklo z2.h, z1.b
 ; VBITS_GE_256-NEXT:    sunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    sunpklo z4.s, z2.h
 ; VBITS_GE_256-NEXT:    sunpklo z5.s, z3.h
 ; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
 ; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    sunpklo z2.s, z2.h
 ; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
 ; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    splice z4.h, p0, z4.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z2.b, z4.b, z4.b
+; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    splice z3.h, p0, z3.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z2.b, z3.b, z3.b
 ; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_256-NEXT:    ret
@@ -229,11 +228,11 @@ define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    ptrue p1.h, vl64
-; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -251,42 +250,42 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p2.h, vl64
 ; CHECK-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEXT:    sunpklo z3.h, z0.b
-; CHECK-NEXT:    sunpklo z5.s, z2.h
-; CHECK-NEXT:    sunpklo z6.s, z3.h
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    sdivr z5.s, p1/m, z5.s, z6.s
-; CHECK-NEXT:    mov z6.d, z0.d
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #128
+; CHECK-NEXT:    sdivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
+; CHECK-NEXT:    sunpklo z5.h, z5.b
+; CHECK-NEXT:    sunpklo z7.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #128
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    sunpklo z3.h, z4.b
-; CHECK-NEXT:    sunpklo z4.h, z6.b
-; CHECK-NEXT:    splice z5.h, p2, z5.h, z2.h
-; CHECK-NEXT:    sunpklo z2.s, z3.h
-; CHECK-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    sunpklo z3.h, z3.b
+; CHECK-NEXT:    sunpklo z6.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #128
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z6.s
-; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    sdivr z6.s, p1/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
+; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    uzp1 z4.b, z5.b, z5.b
-; CHECK-NEXT:    splice z2.h, p2, z2.h, z3.h
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z3.h
 ; CHECK-NEXT:    ptrue p1.b, vl128
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    splice z4.b, p1, z4.b, z2.b
-; CHECK-NEXT:    mls z0.b, p0/m, z4.b, z1.b
+; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
+; CHECK-NEXT:    splice z2.b, p1, z2.b, z3.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, ptr %a
@@ -301,8 +300,8 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: srem_v4i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    sshll v2.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll v2.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    sshll v3.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_128-NEXT:    xtn v2.4h, v2.4s
@@ -311,13 +310,13 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ;
 ; VBITS_GE_256-LABEL: srem_v4i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    sshll v2.4s, v1.4h, #0
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    sshll v2.4s, v1.4h, #0
 ; VBITS_GE_256-NEXT:    sshll v3.4s, v0.4h, #0
 ; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    mov w8, v2.s[1]
-; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
 ; VBITS_GE_256-NEXT:    mov v3.16b, v2.16b
+; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
 ; VBITS_GE_256-NEXT:    mov v3.h[1], w8
 ; VBITS_GE_256-NEXT:    mov w8, v2.s[3]
 ; VBITS_GE_256-NEXT:    mov v3.h[2], w9
@@ -327,13 +326,13 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ;
 ; VBITS_GE_512-LABEL: srem_v4i16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    sshll v2.4s, v1.4h, #0
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_512-NEXT:    sshll v2.4s, v1.4h, #0
 ; VBITS_GE_512-NEXT:    sshll v3.4s, v0.4h, #0
 ; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_512-NEXT:    mov w8, v2.s[1]
-; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
 ; VBITS_GE_512-NEXT:    mov v3.16b, v2.16b
+; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
 ; VBITS_GE_512-NEXT:    mov v3.h[1], w8
 ; VBITS_GE_512-NEXT:    mov w8, v2.s[3]
 ; VBITS_GE_512-NEXT:    mov v3.h[2], w9
@@ -347,23 +346,22 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: srem_v8i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
-; VBITS_GE_128-NEXT:    sshll v4.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    sshll v4.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    sshll v5.4s, v0.4h, #0
-; VBITS_GE_128-NEXT:    movprfx z3, z5
-; VBITS_GE_128-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT:    sshll v3.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
 ; VBITS_GE_128-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
 ; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: srem_v8i16:
 ; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
 ; VBITS_GE_256-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
@@ -374,9 +372,9 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ;
 ; VBITS_GE_512-LABEL: srem_v8i16:
 ; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    sunpklo z2.s, z1.h
 ; VBITS_GE_512-NEXT:    sunpklo z3.s, z0.h
 ; VBITS_GE_512-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
@@ -391,26 +389,27 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @srem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: srem_v16i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    sshll2 v5.4s, v0.8h, #0
-; VBITS_GE_128-NEXT:    sshll v7.4s, v0.4h, #0
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT:    sshll2 v4.4s, v1.8h, #0
-; VBITS_GE_128-NEXT:    sshll2 v6.4s, v2.8h, #0
-; VBITS_GE_128-NEXT:    sshll v16.4s, v2.4h, #0
-; VBITS_GE_128-NEXT:    sshll2 v17.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
+; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
+; VBITS_GE_128-NEXT:    sshll2 v2.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    sshll2 v5.4s, v4.8h, #0
+; VBITS_GE_128-NEXT:    sshll v16.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    ldr q3, [x0]
+; VBITS_GE_128-NEXT:    sshll2 v6.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    sshll v7.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT:    sshll v6.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    sshll v6.4s, v4.4h, #0
+; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT:    sshll v7.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
-; VBITS_GE_128-NEXT:    sshll v16.4s, v3.4h, #0
-; VBITS_GE_128-NEXT:    sdivr z4.s, p0/m, z4.s, z17.s
-; VBITS_GE_128-NEXT:    uzp1 v5.8h, v7.8h, v5.8h
-; VBITS_GE_128-NEXT:    sdivr z6.s, p0/m, z6.s, z16.s
-; VBITS_GE_128-NEXT:    uzp1 v4.8h, v6.8h, v4.8h
-; VBITS_GE_128-NEXT:    mls v2.8h, v5.8h, v0.8h
-; VBITS_GE_128-NEXT:    mls v3.8h, v4.8h, v1.8h
-; VBITS_GE_128-NEXT:    stp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    uzp1 v5.8h, v6.8h, v5.8h
+; VBITS_GE_128-NEXT:    mls v3.8h, v5.8h, v4.8h
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v7.8h, v2.8h
+; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_128-NEXT:    stp q3, q0, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: srem_v16i16:
@@ -419,19 +418,19 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z3.d, z1.d
+; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z3.s, z0.h
 ; VBITS_GE_256-NEXT:    mov z4.d, z0.d
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
 ; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z5.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    mov z3.d, z1.d
 ; VBITS_GE_256-NEXT:    sunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT:    sdivr z2.s, p1/m, z2.s, z5.s
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
+; VBITS_GE_256-NEXT:    sunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
 ; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
@@ -506,19 +505,19 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
 ; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
-; CHECK-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEXT:    sunpklo z5.s, z0.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    ptrue p1.h, vl64
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    ptrue p1.h, vl64
 ; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
@@ -583,41 +582,41 @@ define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @srem_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: srem_v16i32:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
-; VBITS_GE_128-NEXT:    movprfx z16, z0
-; VBITS_GE_128-NEXT:    sdiv z16.s, p0/m, z16.s, z4.s
-; VBITS_GE_128-NEXT:    mls v0.4s, v16.4s, v4.4s
-; VBITS_GE_128-NEXT:    movprfx z16, z1
-; VBITS_GE_128-NEXT:    sdiv z16.s, p0/m, z16.s, z5.s
-; VBITS_GE_128-NEXT:    ldp q7, q6, [x1]
-; VBITS_GE_128-NEXT:    movprfx z4, z3
-; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z6.s
-; VBITS_GE_128-NEXT:    mls v1.4s, v16.4s, v5.4s
-; VBITS_GE_128-NEXT:    movprfx z5, z2
-; VBITS_GE_128-NEXT:    sdiv z5.s, p0/m, z5.s, z7.s
-; VBITS_GE_128-NEXT:    mls v2.4s, v5.4s, v7.4s
-; VBITS_GE_128-NEXT:    mls v3.4s, v4.4s, v6.4s
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT:    stp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    movprfx z4, z1
+; VBITS_GE_128-NEXT:    sdiv z4.s, p0/m, z4.s, z0.s
+; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z19, z2
+; VBITS_GE_128-NEXT:    sdiv z19.s, p0/m, z19.s, z3.s
+; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    movprfx z7, z5
+; VBITS_GE_128-NEXT:    sdiv z7.s, p0/m, z7.s, z6.s
+; VBITS_GE_128-NEXT:    movprfx z18, z16
+; VBITS_GE_128-NEXT:    sdiv z18.s, p0/m, z18.s, z17.s
+; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
+; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
+; VBITS_GE_128-NEXT:    mls v16.4s, v18.4s, v17.4s
+; VBITS_GE_128-NEXT:    mls v5.4s, v7.4s, v6.4s
+; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: srem_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    movprfx z4, z0
-; VBITS_GE_256-NEXT:    sdiv z4.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z3, z0
+; VBITS_GE_256-NEXT:    sdiv z3.s, p0/m, z3.s, z2.s
 ; VBITS_GE_256-NEXT:    movprfx z5, z1
-; VBITS_GE_256-NEXT:    sdiv z5.s, p0/m, z5.s, z3.s
-; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z4.s, z2.s
-; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT:    sdiv z5.s, p0/m, z5.s, z4.s
+; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z3.s, z2.s
+; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z4.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -680,8 +679,8 @@ define void @srem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: srem_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
@@ -697,8 +696,8 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
 define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: srem_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
@@ -730,43 +729,42 @@ define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @srem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: srem_v8i64:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x1, #32]
-; VBITS_GE_128-NEXT:    movprfx z16, z1
-; VBITS_GE_128-NEXT:    sdiv z16.d, p0/m, z16.d, z3.d
-; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z16.d, z3.d
-; VBITS_GE_128-NEXT:    movprfx z3, z0
-; VBITS_GE_128-NEXT:    sdiv z3.d, p0/m, z3.d, z2.d
-; VBITS_GE_128-NEXT:    mls z0.d, p0/m, z3.d, z2.d
-; VBITS_GE_128-NEXT:    ldp q4, q5, [x0]
-; VBITS_GE_128-NEXT:    ldp q7, q6, [x1]
-; VBITS_GE_128-NEXT:    movprfx z16, z5
-; VBITS_GE_128-NEXT:    sdiv z16.d, p0/m, z16.d, z6.d
-; VBITS_GE_128-NEXT:    movprfx z2, z4
-; VBITS_GE_128-NEXT:    sdiv z2.d, p0/m, z2.d, z7.d
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z0, z4
-; VBITS_GE_128-NEXT:    mls z0.d, p0/m, z2.d, z7.d
-; VBITS_GE_128-NEXT:    movprfx z1, z5
-; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    movprfx z4, z1
+; VBITS_GE_128-NEXT:    sdiv z4.d, p0/m, z4.d, z0.d
+; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z19, z2
+; VBITS_GE_128-NEXT:    sdiv z19.d, p0/m, z19.d, z3.d
+; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    movprfx z7, z5
+; VBITS_GE_128-NEXT:    sdiv z7.d, p0/m, z7.d, z6.d
+; VBITS_GE_128-NEXT:    movprfx z18, z16
+; VBITS_GE_128-NEXT:    sdiv z18.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT:    mls z16.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT:    mls z5.d, p0/m, z7.d, z6.d
+; VBITS_GE_128-NEXT:    msb z0.d, p0/m, z4.d, z1.d
+; VBITS_GE_128-NEXT:    movprfx z1, z2
+; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z19.d, z3.d
+; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
 ; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: srem_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    movprfx z4, z0
-; VBITS_GE_256-NEXT:    sdiv z4.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z3, z0
+; VBITS_GE_256-NEXT:    sdiv z3.d, p0/m, z3.d, z2.d
 ; VBITS_GE_256-NEXT:    movprfx z5, z1
-; VBITS_GE_256-NEXT:    sdiv z5.d, p0/m, z5.d, z3.d
-; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z4.d, z2.d
-; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT:    sdiv z5.d, p0/m, z5.d, z4.d
+; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z3.d, z2.d
+; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -833,8 +831,8 @@ define void @srem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: urem_v8i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ushll v2.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll v2.8h, v1.8b, #0
 ; VBITS_GE_128-NEXT:    ushll v3.8h, v0.8b, #0
 ; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
 ; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
@@ -915,26 +913,25 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_128-LABEL: urem_v16i8:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll2 v2.8h, v1.16b, #0
 ; VBITS_GE_128-NEXT:    ushll2 v3.8h, v0.16b, #0
 ; VBITS_GE_128-NEXT:    ushll2 v4.4s, v2.8h, #0
 ; VBITS_GE_128-NEXT:    ushll2 v5.4s, v3.8h, #0
 ; VBITS_GE_128-NEXT:    ushll v2.4s, v2.4h, #0
 ; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; VBITS_GE_128-NEXT:    ushll v6.8h, v1.8b, #0
-; VBITS_GE_128-NEXT:    ushll v7.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    ushll v5.8h, v0.8b, #0
+; VBITS_GE_128-NEXT:    ushll2 v7.4s, v5.8h, #0
+; VBITS_GE_128-NEXT:    ushll v5.4s, v5.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    ushll2 v3.4s, v6.8h, #0
-; VBITS_GE_128-NEXT:    ushll2 v5.4s, v7.8h, #0
-; VBITS_GE_128-NEXT:    ushll v6.4s, v6.4h, #0
-; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
-; VBITS_GE_128-NEXT:    ushll v7.4s, v7.4h, #0
+; VBITS_GE_128-NEXT:    ushll v3.8h, v1.8b, #0
+; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    ushll v3.4s, v3.4h, #0
+; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
 ; VBITS_GE_128-NEXT:    uzp1 v2.8h, v2.8h, v4.8h
-; VBITS_GE_128-NEXT:    movprfx z4, z7
-; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z6.s
-; VBITS_GE_128-NEXT:    uzp1 v3.8h, v4.8h, v3.8h
+; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
+; VBITS_GE_128-NEXT:    uzp1 v3.8h, v3.8h, v6.8h
 ; VBITS_GE_128-NEXT:    uzp1 v2.16b, v3.16b, v2.16b
 ; VBITS_GE_128-NEXT:    mls v0.16b, v2.16b, v1.16b
 ; VBITS_GE_128-NEXT:    ret
@@ -945,20 +942,20 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; VBITS_GE_256-NEXT:    uunpklo z2.h, z1.b
 ; VBITS_GE_256-NEXT:    uunpklo z3.h, z0.b
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    uunpklo z4.s, z2.h
 ; VBITS_GE_256-NEXT:    uunpklo z5.s, z3.h
 ; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
 ; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z3.b, #16
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    uunpklo z2.s, z2.h
 ; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
 ; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_256-NEXT:    uzp1 z4.h, z4.h, z4.h
-; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    splice z4.h, p0, z4.h, z2.h
-; VBITS_GE_256-NEXT:    uzp1 z2.b, z4.b, z4.b
+; VBITS_GE_256-NEXT:    uzp1 z3.h, z4.h, z4.h
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
+; VBITS_GE_256-NEXT:    splice z3.h, p0, z3.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z2.b, z3.b, z3.b
 ; VBITS_GE_256-NEXT:    mls v0.16b, v2.16b, v1.16b
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; VBITS_GE_256-NEXT:    ret
@@ -1047,11 +1044,11 @@ define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    ptrue p1.h, vl64
-; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -1069,42 +1066,42 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p2.h, vl64
 ; CHECK-NEXT:    uunpklo z2.h, z1.b
 ; CHECK-NEXT:    uunpklo z3.h, z0.b
-; CHECK-NEXT:    uunpklo z5.s, z2.h
-; CHECK-NEXT:    uunpklo z6.s, z3.h
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #128
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    udivr z5.s, p1/m, z5.s, z6.s
-; CHECK-NEXT:    mov z6.d, z0.d
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #128
+; CHECK-NEXT:    udivr z4.s, p1/m, z4.s, z5.s
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #128
+; CHECK-NEXT:    uunpklo z5.h, z5.b
+; CHECK-NEXT:    uunpklo z7.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #128
 ; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
-; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #128
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uunpklo z3.h, z4.b
-; CHECK-NEXT:    uunpklo z4.h, z6.b
-; CHECK-NEXT:    splice z5.h, p2, z5.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z3.h
-; CHECK-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    uunpklo z6.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #128
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #128
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z6.s
-; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    udivr z6.s, p1/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z5.s
+; CHECK-NEXT:    ptrue p1.h, vl64
+; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
+; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    uzp1 z4.b, z5.b, z5.b
-; CHECK-NEXT:    splice z2.h, p2, z2.h, z3.h
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z3.h
 ; CHECK-NEXT:    ptrue p1.b, vl128
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    splice z4.b, p1, z4.b, z2.b
-; CHECK-NEXT:    mls z0.b, p0/m, z4.b, z1.b
+; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
+; CHECK-NEXT:    splice z2.b, p1, z2.b, z3.b
+; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, ptr %a
@@ -1119,8 +1116,8 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: urem_v4i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ushll v2.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll v2.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    ushll v3.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_128-NEXT:    xtn v2.4h, v2.4s
@@ -1129,13 +1126,13 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ;
 ; VBITS_GE_256-LABEL: urem_v4i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ushll v2.4s, v1.4h, #0
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_256-NEXT:    ushll v2.4s, v1.4h, #0
 ; VBITS_GE_256-NEXT:    ushll v3.4s, v0.4h, #0
 ; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_256-NEXT:    mov w8, v2.s[1]
-; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
 ; VBITS_GE_256-NEXT:    mov v3.16b, v2.16b
+; VBITS_GE_256-NEXT:    mov w9, v2.s[2]
 ; VBITS_GE_256-NEXT:    mov v3.h[1], w8
 ; VBITS_GE_256-NEXT:    mov w8, v2.s[3]
 ; VBITS_GE_256-NEXT:    mov v3.h[2], w9
@@ -1145,13 +1142,13 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ;
 ; VBITS_GE_512-LABEL: urem_v4i16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ushll v2.4s, v1.4h, #0
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_512-NEXT:    ushll v2.4s, v1.4h, #0
 ; VBITS_GE_512-NEXT:    ushll v3.4s, v0.4h, #0
 ; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; VBITS_GE_512-NEXT:    mov w8, v2.s[1]
-; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
 ; VBITS_GE_512-NEXT:    mov v3.16b, v2.16b
+; VBITS_GE_512-NEXT:    mov w9, v2.s[2]
 ; VBITS_GE_512-NEXT:    mov v3.h[1], w8
 ; VBITS_GE_512-NEXT:    mov w8, v2.s[3]
 ; VBITS_GE_512-NEXT:    mov v3.h[2], w9
@@ -1165,23 +1162,22 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; VBITS_GE_128-LABEL: urem_v8i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
+; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
-; VBITS_GE_128-NEXT:    ushll v4.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ushll v4.4s, v0.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; VBITS_GE_128-NEXT:    ushll v5.4s, v0.4h, #0
-; VBITS_GE_128-NEXT:    movprfx z3, z5
-; VBITS_GE_128-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; VBITS_GE_128-NEXT:    ushll v3.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
 ; VBITS_GE_128-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
 ; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: urem_v8i16:
 ; VBITS_GE_256:       // %bb.0:
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
 ; VBITS_GE_256-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
@@ -1192,9 +1188,9 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ;
 ; VBITS_GE_512-LABEL: urem_v8i16:
 ; VBITS_GE_512:       // %bb.0:
+; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    uunpklo z2.s, z1.h
 ; VBITS_GE_512-NEXT:    uunpklo z3.s, z0.h
 ; VBITS_GE_512-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
@@ -1209,26 +1205,27 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 define void @urem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: urem_v16i16:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    ushll2 v5.4s, v0.8h, #0
-; VBITS_GE_128-NEXT:    ushll v7.4s, v0.4h, #0
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT:    ushll2 v4.4s, v1.8h, #0
-; VBITS_GE_128-NEXT:    ushll2 v6.4s, v2.8h, #0
-; VBITS_GE_128-NEXT:    ushll v16.4s, v2.4h, #0
-; VBITS_GE_128-NEXT:    ushll2 v17.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    ldp q4, q1, [x1]
+; VBITS_GE_128-NEXT:    ldr q0, [x0, #16]
+; VBITS_GE_128-NEXT:    ushll2 v2.4s, v1.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v3.4s, v0.8h, #0
+; VBITS_GE_128-NEXT:    ushll2 v5.4s, v4.8h, #0
+; VBITS_GE_128-NEXT:    ushll v16.4s, v0.4h, #0
+; VBITS_GE_128-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; VBITS_GE_128-NEXT:    ldr q3, [x0]
+; VBITS_GE_128-NEXT:    ushll2 v6.4s, v3.8h, #0
+; VBITS_GE_128-NEXT:    ushll v7.4s, v3.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; VBITS_GE_128-NEXT:    ushll v6.4s, v1.4h, #0
+; VBITS_GE_128-NEXT:    ushll v6.4s, v4.4h, #0
+; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
+; VBITS_GE_128-NEXT:    ushll v7.4s, v1.4h, #0
 ; VBITS_GE_128-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
-; VBITS_GE_128-NEXT:    ushll v16.4s, v3.4h, #0
-; VBITS_GE_128-NEXT:    udivr z4.s, p0/m, z4.s, z17.s
-; VBITS_GE_128-NEXT:    uzp1 v5.8h, v7.8h, v5.8h
-; VBITS_GE_128-NEXT:    udivr z6.s, p0/m, z6.s, z16.s
-; VBITS_GE_128-NEXT:    uzp1 v4.8h, v6.8h, v4.8h
-; VBITS_GE_128-NEXT:    mls v2.8h, v5.8h, v0.8h
-; VBITS_GE_128-NEXT:    mls v3.8h, v4.8h, v1.8h
-; VBITS_GE_128-NEXT:    stp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    uzp1 v5.8h, v6.8h, v5.8h
+; VBITS_GE_128-NEXT:    mls v3.8h, v5.8h, v4.8h
+; VBITS_GE_128-NEXT:    uzp1 v2.8h, v7.8h, v2.8h
+; VBITS_GE_128-NEXT:    mls v0.8h, v2.8h, v1.8h
+; VBITS_GE_128-NEXT:    stp q3, q0, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: urem_v16i16:
@@ -1237,19 +1234,19 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z3.d, z1.d
+; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
 ; VBITS_GE_256-NEXT:    mov z4.d, z0.d
-; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
 ; VBITS_GE_256-NEXT:    ext z4.b, z4.b, z0.b, #16
-; VBITS_GE_256-NEXT:    uunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT:    uunpklo z5.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; VBITS_GE_256-NEXT:    mov z3.d, z1.d
 ; VBITS_GE_256-NEXT:    uunpklo z4.s, z4.h
-; VBITS_GE_256-NEXT:    udivr z2.s, p1/m, z2.s, z5.s
+; VBITS_GE_256-NEXT:    ext z3.b, z3.b, z1.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
 ; VBITS_GE_256-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
 ; VBITS_GE_256-NEXT:    splice z2.h, p1, z2.h, z3.h
 ; VBITS_GE_256-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
@@ -1324,19 +1321,19 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p1.s, vl64
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
 ; CHECK-NEXT:    ext z4.b, z4.b, z0.b, #128
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpklo z5.s, z0.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z3.s
+; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    udivr z2.s, p1/m, z2.s, z5.s
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #128
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
+; CHECK-NEXT:    ptrue p1.h, vl64
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    ptrue p1.h, vl64
 ; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
 ; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
@@ -1401,41 +1398,41 @@ define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @urem_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: urem_v16i32:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
 ; VBITS_GE_128-NEXT:    ptrue p0.s, vl4
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x0]
-; VBITS_GE_128-NEXT:    ldp q4, q5, [x1, #32]
-; VBITS_GE_128-NEXT:    movprfx z16, z0
-; VBITS_GE_128-NEXT:    udiv z16.s, p0/m, z16.s, z4.s
-; VBITS_GE_128-NEXT:    mls v0.4s, v16.4s, v4.4s
-; VBITS_GE_128-NEXT:    movprfx z16, z1
-; VBITS_GE_128-NEXT:    udiv z16.s, p0/m, z16.s, z5.s
-; VBITS_GE_128-NEXT:    ldp q7, q6, [x1]
-; VBITS_GE_128-NEXT:    movprfx z4, z3
-; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z6.s
-; VBITS_GE_128-NEXT:    mls v1.4s, v16.4s, v5.4s
-; VBITS_GE_128-NEXT:    movprfx z5, z2
-; VBITS_GE_128-NEXT:    udiv z5.s, p0/m, z5.s, z7.s
-; VBITS_GE_128-NEXT:    mls v2.4s, v5.4s, v7.4s
-; VBITS_GE_128-NEXT:    mls v3.4s, v4.4s, v6.4s
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT:    stp q2, q3, [x0]
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    movprfx z4, z1
+; VBITS_GE_128-NEXT:    udiv z4.s, p0/m, z4.s, z0.s
+; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z19, z2
+; VBITS_GE_128-NEXT:    udiv z19.s, p0/m, z19.s, z3.s
+; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    movprfx z7, z5
+; VBITS_GE_128-NEXT:    udiv z7.s, p0/m, z7.s, z6.s
+; VBITS_GE_128-NEXT:    movprfx z18, z16
+; VBITS_GE_128-NEXT:    udiv z18.s, p0/m, z18.s, z17.s
+; VBITS_GE_128-NEXT:    mls v1.4s, v4.4s, v0.4s
+; VBITS_GE_128-NEXT:    mls v2.4s, v19.4s, v3.4s
+; VBITS_GE_128-NEXT:    mls v16.4s, v18.4s, v17.4s
+; VBITS_GE_128-NEXT:    mls v5.4s, v7.4s, v6.4s
+; VBITS_GE_128-NEXT:    stp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: urem_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    movprfx z4, z0
-; VBITS_GE_256-NEXT:    udiv z4.s, p0/m, z4.s, z2.s
+; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z3, z0
+; VBITS_GE_256-NEXT:    udiv z3.s, p0/m, z3.s, z2.s
 ; VBITS_GE_256-NEXT:    movprfx z5, z1
-; VBITS_GE_256-NEXT:    udiv z5.s, p0/m, z5.s, z3.s
-; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z4.s, z2.s
-; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z3.s
+; VBITS_GE_256-NEXT:    udiv z5.s, p0/m, z5.s, z4.s
+; VBITS_GE_256-NEXT:    mls z0.s, p0/m, z3.s, z2.s
+; VBITS_GE_256-NEXT:    mls z1.s, p0/m, z5.s, z4.s
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -1498,8 +1495,8 @@ define void @urem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: urem_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
@@ -1515,8 +1512,8 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
 define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(1,0) #0 {
 ; CHECK-LABEL: urem_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
@@ -1548,43 +1545,42 @@ define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @urem_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_128-LABEL: urem_v8i64:
 ; VBITS_GE_128:       // %bb.0:
-; VBITS_GE_128-NEXT:    ldp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT:    ptrue p0.d, vl2
-; VBITS_GE_128-NEXT:    ldp q2, q3, [x1, #32]
-; VBITS_GE_128-NEXT:    movprfx z16, z1
-; VBITS_GE_128-NEXT:    udiv z16.d, p0/m, z16.d, z3.d
-; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z16.d, z3.d
-; VBITS_GE_128-NEXT:    movprfx z3, z0
-; VBITS_GE_128-NEXT:    udiv z3.d, p0/m, z3.d, z2.d
-; VBITS_GE_128-NEXT:    mls z0.d, p0/m, z3.d, z2.d
-; VBITS_GE_128-NEXT:    ldp q4, q5, [x0]
-; VBITS_GE_128-NEXT:    ldp q7, q6, [x1]
-; VBITS_GE_128-NEXT:    movprfx z16, z5
-; VBITS_GE_128-NEXT:    udiv z16.d, p0/m, z16.d, z6.d
-; VBITS_GE_128-NEXT:    movprfx z2, z4
-; VBITS_GE_128-NEXT:    udiv z2.d, p0/m, z2.d, z7.d
-; VBITS_GE_128-NEXT:    stp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT:    movprfx z0, z4
-; VBITS_GE_128-NEXT:    mls z0.d, p0/m, z2.d, z7.d
-; VBITS_GE_128-NEXT:    movprfx z1, z5
-; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z16.d, z6.d
+; VBITS_GE_128-NEXT:    ldp q0, q3, [x1]
+; VBITS_GE_128-NEXT:    ldp q1, q2, [x0]
+; VBITS_GE_128-NEXT:    movprfx z4, z1
+; VBITS_GE_128-NEXT:    udiv z4.d, p0/m, z4.d, z0.d
+; VBITS_GE_128-NEXT:    ldp q16, q5, [x0, #32]
+; VBITS_GE_128-NEXT:    movprfx z19, z2
+; VBITS_GE_128-NEXT:    udiv z19.d, p0/m, z19.d, z3.d
+; VBITS_GE_128-NEXT:    ldp q17, q6, [x1, #32]
+; VBITS_GE_128-NEXT:    movprfx z7, z5
+; VBITS_GE_128-NEXT:    udiv z7.d, p0/m, z7.d, z6.d
+; VBITS_GE_128-NEXT:    movprfx z18, z16
+; VBITS_GE_128-NEXT:    udiv z18.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT:    mls z16.d, p0/m, z18.d, z17.d
+; VBITS_GE_128-NEXT:    mls z5.d, p0/m, z7.d, z6.d
+; VBITS_GE_128-NEXT:    msb z0.d, p0/m, z4.d, z1.d
+; VBITS_GE_128-NEXT:    movprfx z1, z2
+; VBITS_GE_128-NEXT:    mls z1.d, p0/m, z19.d, z3.d
+; VBITS_GE_128-NEXT:    stp q16, q5, [x0, #32]
 ; VBITS_GE_128-NEXT:    stp q0, q1, [x0]
 ; VBITS_GE_128-NEXT:    ret
 ;
 ; VBITS_GE_256-LABEL: urem_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    movprfx z4, z0
-; VBITS_GE_256-NEXT:    udiv z4.d, p0/m, z4.d, z2.d
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    movprfx z3, z0
+; VBITS_GE_256-NEXT:    udiv z3.d, p0/m, z3.d, z2.d
 ; VBITS_GE_256-NEXT:    movprfx z5, z1
-; VBITS_GE_256-NEXT:    udiv z5.d, p0/m, z5.d, z3.d
-; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z4.d, z2.d
-; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z3.d
+; VBITS_GE_256-NEXT:    udiv z5.d, p0/m, z5.d, z4.d
+; VBITS_GE_256-NEXT:    mls z0.d, p0/m, z3.d, z2.d
+; VBITS_GE_256-NEXT:    mls z1.d, p0/m, z5.d, z4.d
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
index e62dc5900e8437..30b680b174b983 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
@@ -34,14 +34,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_
 define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.b, vl32
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    mov z2.b, w8
-; CHECK-NEXT:    cmpne p1.b, p1/z, z2.b, #0
-; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    mov z0.b, w8
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.b, p1/z, z0.b, #0
+; CHECK-NEXT:    sel z0.b, p1, z1.b, z2.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <32 x i8>, ptr %a
@@ -54,32 +54,32 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_256-NEXT:    ptrue p1.b
-; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
-; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
-; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z4.b, w9
-; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z4.b, #0
+; VBITS_GE_256-NEXT:    mov z0.b, w8
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
+; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0, x8]
+; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1, x8]
+; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z0.b, #0
+; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sel z1.b, p1, z1.b, z3.b
-; VBITS_GE_256-NEXT:    sel z0.b, p1, z0.b, z2.b
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov z0.b, p1/m, z2.b
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v64i8:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
-; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p1.b
-; VBITS_GE_512-NEXT:    mov z2.b, w8
-; VBITS_GE_512-NEXT:    cmpne p1.b, p1/z, z2.b, #0
-; VBITS_GE_512-NEXT:    sel z0.b, p1, z0.b, z1.b
+; VBITS_GE_512-NEXT:    mov z0.b, w8
+; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1b { z2.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpne p1.b, p1/z, z0.b, #0
+; VBITS_GE_512-NEXT:    sel z0.b, p1, z1.b, z2.b
 ; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <64 x i8>, ptr %a
@@ -92,14 +92,14 @@ define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 {
 define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v128i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.b, vl128
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    mov z2.b, w8
-; CHECK-NEXT:    cmpne p1.b, p1/z, z2.b, #0
-; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    mov z0.b, w8
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.b, p1/z, z0.b, #0
+; CHECK-NEXT:    sel z0.b, p1, z1.b, z2.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <128 x i8>, ptr %a
@@ -112,14 +112,14 @@ define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 define void @select_v256i8(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v256i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.b, vl256
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    mov z2.b, w8
-; CHECK-NEXT:    cmpne p1.b, p1/z, z2.b, #0
-; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    mov z0.b, w8
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0]
+; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.b, p1/z, z0.b, #0
+; CHECK-NEXT:    sel z0.b, p1, z1.b, z2.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <256 x i8>, ptr %a
@@ -158,14 +158,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) vscale_
 define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z1.h, z2.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <16 x i16>, ptr %a
@@ -178,32 +178,32 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_256-NEXT:    ptrue p1.h
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z4.h, w9
-; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z4.h, #0
+; VBITS_GE_256-NEXT:    mov z0.h, w8
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sel z1.h, p1, z1.h, z3.h
-; VBITS_GE_256-NEXT:    sel z0.h, p1, z0.h, z2.h
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov z0.h, p1/m, z2.h
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v32i16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p1.h
-; VBITS_GE_512-NEXT:    mov z2.h, w8
-; VBITS_GE_512-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; VBITS_GE_512-NEXT:    sel z0.h, p1, z0.h, z1.h
+; VBITS_GE_512-NEXT:    mov z0.h, w8
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; VBITS_GE_512-NEXT:    sel z0.h, p1, z1.h, z2.h
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <32 x i16>, ptr %a
@@ -216,14 +216,14 @@ define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 {
 define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v64i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z1.h, z2.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <64 x i16>, ptr %a
@@ -236,14 +236,14 @@ define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 define void @select_v128i16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v128i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpne p1.h, p1/z, z2.h, #0
-; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.h, p1/z, z0.h, #0
+; CHECK-NEXT:    sel z0.h, p1, z1.h, z2.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <128 x i16>, ptr %a
@@ -282,14 +282,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_
 define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <8 x i32>, ptr %a
@@ -302,32 +302,32 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    and w9, w2, #0x1
+; VBITS_GE_256-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_256-NEXT:    ptrue p1.s
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z4.s, w9
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z4.s, #0
+; VBITS_GE_256-NEXT:    mov z0.s, w8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sel z1.s, p1, z1.s, z3.s
-; VBITS_GE_256-NEXT:    sel z0.s, p1, z0.s, z2.s
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov z0.s, p1/m, z2.s
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v16i32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    and w8, w2, #0x1
 ; VBITS_GE_512-NEXT:    ptrue p1.s
-; VBITS_GE_512-NEXT:    mov z2.s, w8
-; VBITS_GE_512-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; VBITS_GE_512-NEXT:    sel z0.s, p1, z0.s, z1.s
+; VBITS_GE_512-NEXT:    mov z0.s, w8
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; VBITS_GE_512-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <16 x i32>, ptr %a
@@ -340,14 +340,14 @@ define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 {
 define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v32i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <32 x i32>, ptr %a
@@ -360,14 +360,14 @@ define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 define void @select_v64i32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v64i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    and w8, w2, #0x1
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpne p1.s, p1/z, z2.s, #0
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.s, p1/z, z0.s, #0
+; CHECK-NEXT:    sel z0.s, p1, z1.s, z2.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <64 x i32>, ptr %a
@@ -406,15 +406,15 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_
 define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    and x8, x2, #0x1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <4 x i64>, ptr %a
@@ -427,34 +427,34 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    and x9, x2, #0x1
+; VBITS_GE_256-NEXT:    // kill: def $w2 killed $w2 def $x2
+; VBITS_GE_256-NEXT:    and x8, x2, #0x1
 ; VBITS_GE_256-NEXT:    ptrue p1.d
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z4.d, x9
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z4.d, #0
+; VBITS_GE_256-NEXT:    mov z0.d, x8
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    sel z1.d, p1, z1.d, z3.d
-; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z2.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov z0.d, p1/m, z2.d
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: select_v8i64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; VBITS_GE_512-NEXT:    and x8, x2, #0x1
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    ptrue p1.d
-; VBITS_GE_512-NEXT:    mov z2.d, x8
-; VBITS_GE_512-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
+; VBITS_GE_512-NEXT:    mov z0.d, x8
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; VBITS_GE_512-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %op1 = load volatile <8 x i64>, ptr %a
@@ -467,15 +467,15 @@ define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 {
 define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    and x8, x2, #0x1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <16 x i64>, ptr %a
@@ -488,15 +488,15 @@ define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 define void @select_v32i64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    and x8, x2, #0x1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p1.d, p1/z, z0.d, #0
+; CHECK-NEXT:    sel z0.d, p1, z1.d, z2.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load volatile <32 x i64>, ptr %a

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
index 24439ef0eb7bdd..0424773c14fd08 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll
@@ -50,8 +50,8 @@ define void @ashr_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @ashr_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ashr_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -150,8 +150,8 @@ define void @ashr_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @ashr_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ashr_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -250,8 +250,8 @@ define void @ashr_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @ashr_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ashr_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -350,8 +350,8 @@ define void @ashr_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @ashr_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ashr_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -454,8 +454,8 @@ define void @lshr_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @lshr_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: lshr_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -554,8 +554,8 @@ define void @lshr_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @lshr_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: lshr_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -654,8 +654,8 @@ define void @lshr_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @lshr_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: lshr_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -754,8 +754,8 @@ define void @lshr_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @lshr_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: lshr_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
@@ -856,8 +856,8 @@ define void @shl_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shl_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shl_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -954,8 +954,8 @@ define void @shl_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shl_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shl_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -1052,8 +1052,8 @@ define void @shl_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shl_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shl_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -1150,8 +1150,8 @@ define void @shl_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shl_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shl_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
index cec0e29495d401..50040eaa61e6c5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -50,8 +50,8 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @ucvtf_v32i16_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ucvtf z0.h, p0/m, z0.h
@@ -131,8 +131,8 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
 define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v8i16_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
@@ -147,7 +147,7 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ucvtf_v16i16_v16f32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
@@ -252,16 +252,16 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ucvtf_v8i16_v8f64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr q0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; VBITS_GE_256-NEXT:    ucvtf z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -352,18 +352,18 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) vscale_range(2,0) #0 {
 define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ucvtf z0.h, p0/m, z0.s
 ; VBITS_GE_256-NEXT:    ucvtf z1.h, p0/m, z1.s
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f16:
@@ -454,8 +454,8 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @ucvtf_v16i32_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ucvtf z0.s, p0/m, z0.s
@@ -535,8 +535,8 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
 define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v4i32_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
@@ -551,7 +551,7 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ucvtf_v8i32_v8f64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
@@ -624,8 +624,8 @@ define <1 x half> @ucvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
 define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: ucvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
@@ -653,8 +653,8 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) vscale_range(2,0) #0 {
 define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ucvtf z0.h, p0/m, z0.d
@@ -757,18 +757,18 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) vscale_range(2,0) #0 {
 define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ucvtf z0.s, p0/m, z0.d
 ; VBITS_GE_256-NEXT:    ucvtf z1.s, p0/m, z1.d
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f32:
@@ -861,8 +861,8 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @ucvtf_v8i64_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ucvtf z0.d, p0/m, z0.d
@@ -957,8 +957,8 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @scvtf_v32i16_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: scvtf_v32i16_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    scvtf z0.h, p0/m, z0.h
@@ -1038,8 +1038,8 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 {
 define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v8i16_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
@@ -1054,7 +1054,7 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: scvtf_v16i16_v16f32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
@@ -1165,16 +1165,16 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: scvtf_v8i16_v8f64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr q0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d
 ; VBITS_GE_256-NEXT:    scvtf z1.d, p0/m, z1.d
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -1271,18 +1271,18 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) vscale_range(2,0) #0 {
 define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    scvtf z0.h, p0/m, z0.s
 ; VBITS_GE_256-NEXT:    scvtf z1.h, p0/m, z1.s
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f16:
@@ -1373,8 +1373,8 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @scvtf_v16i32_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    scvtf z0.s, p0/m, z0.s
@@ -1454,8 +1454,8 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 {
 define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v4i32_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
@@ -1470,7 +1470,7 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: scvtf_v8i32_v8f64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
@@ -1549,8 +1549,8 @@ define <1 x half> @scvtf_v1i64_v1f16(<1 x i64> %op1) vscale_range(2,0) #0 {
 define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: scvtf_v2i64_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
@@ -1578,8 +1578,8 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) vscale_range(2,0) #0 {
 define <8 x half> @scvtf_v8i64_v8f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    scvtf z0.h, p0/m, z0.d
@@ -1682,18 +1682,18 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) vscale_range(2,0) #0 {
 define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    scvtf z0.s, p0/m, z0.d
 ; VBITS_GE_256-NEXT:    scvtf z1.s, p0/m, z1.d
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f32:
@@ -1786,8 +1786,8 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @scvtf_v8i64_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    scvtf z0.d, p0/m, z0.d

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
index bfd917feb7ee4e..2c6ffeaeefd6d9 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
@@ -50,8 +50,8 @@ define void @select_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @select_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
@@ -163,8 +163,8 @@ define void @select_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @select_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
@@ -276,8 +276,8 @@ define void @select_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @select_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -390,8 +390,8 @@ define void @select_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @select_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll
index bb4f13178f95f7..78b41f71f0ea28 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll
@@ -6,14 +6,14 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i32> @test(ptr %arg1, ptr %arg2) {
 ; CHECK-LABEL: test:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0]
 ; CHECK-NEXT:    add z1.s, z0.s, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
-; CHECK-NEXT:    dup v0.4s, v0.s[2]
 ; CHECK-NEXT:    add z2.s, z2.s, z2.s
+; CHECK-NEXT:    dup v0.4s, v0.s[2]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    st1w { z2.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -29,14 +29,14 @@ entry:
 define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0]
 ; CHECK-NEXT:    add z1.s, z0.s, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #24
-; CHECK-NEXT:    dup v0.2s, v0.s[0]
 ; CHECK-NEXT:    add z2.s, z2.s, z2.s
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    st1w { z2.s }, p0, [x0]
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
index 9771bdf2b6f026..8c574f8e4716a7 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll
@@ -52,8 +52,8 @@ define <8 x float> @load_v8f32(ptr %a) #0 {
 define <16 x float> @load_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: load_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
@@ -87,24 +87,24 @@ define <16 x float> @load_v16f32(ptr %a) #0 {
 define <32 x float> @load_v32f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: load_v32f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #24
-; VBITS_GE_256-NEXT:    mov x11, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #24 // =0x18
+; VBITS_GE_256-NEXT:    mov x10, #16 // =0x10
+; VBITS_GE_256-NEXT:    mov x11, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x10, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x10, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8, x11, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x8]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: load_v32f32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov x9, #16
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
 ; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
@@ -131,52 +131,52 @@ define <32 x float> @load_v32f32(ptr %a) #0 {
 define <64 x float> @load_v64f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: load_v64f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    mov x10, #48
-; VBITS_GE_256-NEXT:    mov x11, #56
-; VBITS_GE_256-NEXT:    mov x12, #32
-; VBITS_GE_256-NEXT:    mov x13, #40
-; VBITS_GE_256-NEXT:    mov x14, #16
-; VBITS_GE_256-NEXT:    mov x15, #24
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x11, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x12, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x13, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x14, lsl #2]
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
+; VBITS_GE_256-NEXT:    mov x10, #16 // =0x10
+; VBITS_GE_256-NEXT:    mov x11, #24 // =0x18
+; VBITS_GE_256-NEXT:    mov x12, #56 // =0x38
+; VBITS_GE_256-NEXT:    mov x13, #32 // =0x20
+; VBITS_GE_256-NEXT:    mov x14, #48 // =0x30
+; VBITS_GE_256-NEXT:    mov x15, #40 // =0x28
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x12, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0, x14, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x0, x15, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z6.s }, p0/z, [x0, x13, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z7.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x11, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x8, x13, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8, x12, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8, x12, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x8, x14, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z5.s }, p0, [x8, x15, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x8, x14, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z6.s }, p0, [x8, x13, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z4.s }, p0, [x8, x11, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z3.s }, p0, [x8, x10, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z7.s }, p0, [x8]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: load_v64f32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov x9, #32
-; VBITS_GE_512-NEXT:    mov x10, #48
-; VBITS_GE_512-NEXT:    mov x11, #16
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    mov x9, #48 // =0x30
+; VBITS_GE_512-NEXT:    mov x10, #32 // =0x20
+; VBITS_GE_512-NEXT:    mov x11, #16 // =0x10
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
 ; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0, x10, lsl #2]
 ; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x0, x11, lsl #2]
 ; VBITS_GE_512-NEXT:    ld1w { z3.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    st1w { z1.s }, p0, [x8, x10, lsl #2]
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z1.s }, p0, [x8, x10, lsl #2]
 ; VBITS_GE_512-NEXT:    st1w { z2.s }, p0, [x8, x11, lsl #2]
 ; VBITS_GE_512-NEXT:    st1w { z3.s }, p0, [x8]
 ; VBITS_GE_512-NEXT:    ret
 ;
 ; VBITS_GE_1024-LABEL: load_v64f32:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov x9, #32
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT:    mov x9, #32 // =0x20
 ; VBITS_GE_1024-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
 ; VBITS_GE_1024-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x8, x9, lsl #2]

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
index 61dbcbdb475db1..d0585274a43e3c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
@@ -13,8 +13,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define i8 @andv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -26,8 +26,8 @@ define i8 @andv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 define i8 @andv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -51,8 +51,8 @@ define i8 @andv_v32i8(ptr %a) vscale_range(2,0) #0 {
 define i8 @andv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: andv_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
@@ -102,8 +102,8 @@ define i8 @andv_v256i8(ptr %a) vscale_range(16,0) #0 {
 define i16 @andv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -115,8 +115,8 @@ define i16 @andv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 define i16 @andv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -140,8 +140,8 @@ define i16 @andv_v16i16(ptr %a) vscale_range(2,0) #0 {
 define i16 @andv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: andv_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
@@ -191,8 +191,8 @@ define i16 @andv_v128i16(ptr %a) vscale_range(16,0) #0 {
 define i32 @andv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -204,8 +204,8 @@ define i32 @andv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 define i32 @andv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -229,8 +229,8 @@ define i32 @andv_v8i32(ptr %a) vscale_range(2,0) #0 {
 define i32 @andv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: andv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
@@ -291,8 +291,8 @@ define i64 @andv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 define i64 @andv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: andv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    andv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -316,8 +316,8 @@ define i64 @andv_v4i64(ptr %a) vscale_range(2,0) #0 {
 define i64 @andv_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: andv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
@@ -371,8 +371,8 @@ define i64 @andv_v32i64(ptr %a) vscale_range(16,0) #0 {
 define i8 @eorv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -384,8 +384,8 @@ define i8 @eorv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 define i8 @eorv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -409,8 +409,8 @@ define i8 @eorv_v32i8(ptr %a) vscale_range(2,0) #0 {
 define i8 @eorv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: eorv_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
@@ -460,8 +460,8 @@ define i8 @eorv_v256i8(ptr %a) vscale_range(16,0) #0 {
 define i16 @eorv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -473,8 +473,8 @@ define i16 @eorv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 define i16 @eorv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -498,8 +498,8 @@ define i16 @eorv_v16i16(ptr %a) vscale_range(2,0) #0 {
 define i16 @eorv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: eorv_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
@@ -549,8 +549,8 @@ define i16 @eorv_v128i16(ptr %a) vscale_range(16,0) #0 {
 define i32 @eorv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -562,8 +562,8 @@ define i32 @eorv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 define i32 @eorv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -587,8 +587,8 @@ define i32 @eorv_v8i32(ptr %a) vscale_range(2,0) #0 {
 define i32 @eorv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: eorv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
@@ -649,8 +649,8 @@ define i64 @eorv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 define i64 @eorv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: eorv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    eorv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -674,8 +674,8 @@ define i64 @eorv_v4i64(ptr %a) vscale_range(2,0) #0 {
 define i64 @eorv_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: eorv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
@@ -729,8 +729,8 @@ define i64 @eorv_v32i64(ptr %a) vscale_range(16,0) #0 {
 define i8 @orv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -742,8 +742,8 @@ define i8 @orv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
 define i8 @orv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -767,8 +767,8 @@ define i8 @orv_v32i8(ptr %a) vscale_range(2,0) #0 {
 define i8 @orv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: orv_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
@@ -818,8 +818,8 @@ define i8 @orv_v256i8(ptr %a) vscale_range(16,0) #0 {
 define i16 @orv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -831,8 +831,8 @@ define i16 @orv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
 define i16 @orv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -856,8 +856,8 @@ define i16 @orv_v16i16(ptr %a) vscale_range(2,0) #0 {
 define i16 @orv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: orv_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
@@ -907,8 +907,8 @@ define i16 @orv_v128i16(ptr %a) vscale_range(16,0) #0 {
 define i32 @orv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -920,8 +920,8 @@ define i32 @orv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
 define i32 @orv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -945,8 +945,8 @@ define i32 @orv_v8i32(ptr %a) vscale_range(2,0) #0 {
 define i32 @orv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: orv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
@@ -1007,8 +1007,8 @@ define i64 @orv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
 define i64 @orv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: orv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    orv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -1032,8 +1032,8 @@ define i64 @orv_v4i64(ptr %a) vscale_range(2,0) #0 {
 define i64 @orv_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: orv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
index deafa73fbbe7fe..29ad550c40d91b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll
@@ -12,8 +12,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
@@ -42,8 +42,8 @@ define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_gather_v8i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
@@ -114,8 +114,8 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
@@ -146,8 +146,8 @@ define void @masked_gather_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_gather_v8i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
@@ -214,8 +214,8 @@ define void @masked_gather_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_gather_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    str d0, [x0]
@@ -244,8 +244,9 @@ define void @masked_gather_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_gather_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
@@ -254,8 +255,7 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8i32:
@@ -310,8 +310,8 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x1]
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
@@ -338,8 +338,8 @@ define void @masked_gather_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_gather_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [z0.d]

diff  --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 11b57ad319bea7..1d240bf38d482c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -13,15 +13,15 @@ define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x0, #1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ldrb w8, [x0, #1]
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    cmeq v0.2s, v0.2s, #0
 ; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
@@ -39,9 +39,9 @@ define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    cmeq v0.4h, v0.4h, #0
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
@@ -60,13 +60,12 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8i8:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr d0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    zip2 v1.8b, v0.8b, v0.8b
 ; VBITS_GE_256-NEXT:    zip1 v0.8b, v0.8b, v0.8b
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    shl v1.4h, v1.4h, #8
 ; VBITS_GE_256-NEXT:    shl v0.4h, v0.4h, #8
 ; VBITS_GE_256-NEXT:    sshr v1.4h, v1.4h, #8
@@ -76,9 +75,10 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.d }, p1/z, [z2.d]
-; VBITS_GE_256-NEXT:    ld1b { z1.d }, p0/z, [z3.d]
+; VBITS_GE_256-NEXT:    ld1b { z1.d }, p0/z, [z1.d]
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
@@ -91,9 +91,9 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ldr d0, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    cmeq v0.8b, v0.8b, #0
 ; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z0.d, #0
@@ -116,9 +116,9 @@ define void @masked_gather_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    cmeq v0.16b, v0.16b, #0
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
@@ -142,12 +142,12 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.b, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1b { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <32 x i8>, ptr %a
@@ -166,15 +166,15 @@ define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    ldrh w9, [x0, #2]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ldrh w8, [x0, #2]
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    cmeq v0.2s, v0.2s, #0
 ; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
@@ -192,9 +192,9 @@ define void @masked_gather_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    cmeq v0.4h, v0.4h, #0
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
@@ -214,23 +214,23 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8i16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr q0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z0.d, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.d }, p1/z, [z3.d]
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [z2.d]
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [z2.d]
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; VBITS_GE_256-NEXT:    mov v0.d[1], v1.d[0]
 ; VBITS_GE_256-NEXT:    str q0, [x0]
@@ -240,9 +240,9 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ldr q0, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    cmeq v0.8h, v0.8h, #0
 ; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
@@ -264,11 +264,11 @@ define void @masked_gather_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ptrue p0.h, vl16
 ; CHECK-NEXT:    ptrue p1.d, vl16
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <16 x i16>, ptr %a
@@ -285,11 +285,11 @@ define void @masked_gather_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.h, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <32 x i16>, ptr %a
@@ -309,11 +309,11 @@ define void @masked_gather_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    cmeq v0.2s, v0.2s, #0
 ; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
@@ -330,9 +330,9 @@ define void @masked_gather_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
@@ -350,25 +350,25 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p2.d, vl4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z0.s, #0
-; VBITS_GE_256-NEXT:    mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [x1]
+; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    punpklo p1.h, p1.b
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p2/z, [z2.d]
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
+; VBITS_GE_256-NEXT:    and p1.b, p1/z, p1.b, p2.b
+; VBITS_GE_256-NEXT:    cmpne p2.d, p2/z, z0.d, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.d }, p1/z, [z2.d]
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z2.s, z2.s
+; VBITS_GE_256-NEXT:    ld1w { z1.d }, p2/z, [z1.d]
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    splice z0.s, p1, z0.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8i32:
@@ -376,10 +376,10 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; VBITS_GE_512-NEXT:    st1w { z0.d }, p1, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %cval = load <8 x i32>, ptr %a
@@ -396,10 +396,10 @@ define void @masked_gather_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl16
 ; CHECK-NEXT:    ptrue p1.d, vl16
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <16 x i32>, ptr %a
@@ -416,10 +416,10 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <32 x i32>, ptr %a
@@ -460,12 +460,12 @@ define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <2 x i64>, ptr %a
@@ -497,18 +497,18 @@ define void @masked_gather_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_gather_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [z2.d]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [z3.d]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [z2.d]
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p2/z, [z0.d]
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8i64:
@@ -572,9 +572,8 @@ define void @masked_gather_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
-; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    fcmeq v1.4h, v1.4h, #0.0
 ; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    mov v0.h[0], v1.h[0]
@@ -583,7 +582,8 @@ define void @masked_gather_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z2.d]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    str s0, [x0]
@@ -601,9 +601,9 @@ define void @masked_gather_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq v0.4h, v0.4h, #0.0
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
@@ -623,23 +623,23 @@ define void @masked_gather_v8f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8f16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr q0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    fcmeq v0.8h, v0.8h, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z0.d, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.d }, p1/z, [z3.d]
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [z2.d]
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [z2.d]
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; VBITS_GE_256-NEXT:    mov v0.d[1], v1.d[0]
 ; VBITS_GE_256-NEXT:    str q0, [x0]
@@ -649,9 +649,9 @@ define void @masked_gather_v8f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ldr q0, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    fcmeq v0.8h, v0.8h, #0.0
 ; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
@@ -673,11 +673,11 @@ define void @masked_gather_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ptrue p0.h, vl16
 ; CHECK-NEXT:    ptrue p1.d, vl16
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <16 x half>, ptr %a
@@ -694,11 +694,11 @@ define void @masked_gather_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.h, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <32 x half>, ptr %a
@@ -718,11 +718,11 @@ define void @masked_gather_v2f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    fcmeq v0.2s, v0.2s, #0.0
 ; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
@@ -739,9 +739,9 @@ define void @masked_gather_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq v0.4s, v0.4s, #0.0
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
@@ -759,25 +759,25 @@ define void @masked_gather_v8f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8f32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p2.d, vl4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p2.h, p2.b
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [x1]
+; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    punpklo p1.h, p1.b
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    and p2.b, p2/z, p2.b, p1.b
 ; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    ld1w { z2.d }, p2/z, [z2.d]
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_256-NEXT:    ld1w { z0.d }, p1/z, [z1.d]
+; VBITS_GE_256-NEXT:    and p1.b, p1/z, p1.b, p2.b
+; VBITS_GE_256-NEXT:    cmpne p2.d, p2/z, z0.d, #0
+; VBITS_GE_256-NEXT:    ld1w { z0.d }, p1/z, [z2.d]
 ; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z2.s, z2.s
+; VBITS_GE_256-NEXT:    ld1w { z1.d }, p2/z, [z1.d]
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_256-NEXT:    splice z0.s, p1, z0.s, z1.s
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8f32:
@@ -785,10 +785,10 @@ define void @masked_gather_v8f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
 ; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_512-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; VBITS_GE_512-NEXT:    st1w { z0.d }, p1, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %cval = load <8 x float>, ptr %a
@@ -805,10 +805,10 @@ define void @masked_gather_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl16
 ; CHECK-NEXT:    ptrue p1.d, vl16
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <16 x float>, ptr %a
@@ -825,10 +825,10 @@ define void @masked_gather_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <32 x float>, ptr %a
@@ -869,12 +869,12 @@ define void @masked_gather_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_gather_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    fcmeq v0.2d, v0.2d, #0.0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <2 x double>, ptr %a
@@ -890,9 +890,9 @@ define void @masked_gather_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <4 x double>, ptr %a
@@ -906,16 +906,16 @@ define void @masked_gather_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_gather_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_gather_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
 ; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z1.d, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [z2.d]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [z3.d]
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [z1.d]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
@@ -924,9 +924,9 @@ define void @masked_gather_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %cval = load <8 x double>, ptr %a
@@ -942,9 +942,9 @@ define void @masked_gather_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <16 x double>, ptr %a
@@ -960,9 +960,9 @@ define void @masked_gather_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl32
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z1.d]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %cval = load <32 x double>, ptr %a
@@ -982,10 +982,10 @@ define void @masked_gather_32b_scaled_sext_f16(ptr %a, ptr %b, ptr %base) vscale
 ; CHECK-NEXT:    ptrue p0.h, vl32
 ; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z1.s, sxtw #1]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z0.s, sxtw #1]
 ; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x half>, ptr %a
@@ -1003,9 +1003,9 @@ define void @masked_gather_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x2, z0.s, sxtw #2]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, ptr %a
@@ -1023,9 +1023,9 @@ define void @masked_gather_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl32
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x2, z1.d, lsl #3]
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x2, z0.d, lsl #3]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x double>, ptr %a
@@ -1044,10 +1044,10 @@ define void @masked_gather_32b_scaled_zext(ptr %a, ptr %b, ptr %base) vscale_ran
 ; CHECK-NEXT:    ptrue p0.h, vl32
 ; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z1.s, uxtw #1]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z0.s, uxtw #1]
 ; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x half>, ptr %a
@@ -1066,10 +1066,10 @@ define void @masked_gather_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) vscale_r
 ; CHECK-NEXT:    ptrue p0.h, vl32
 ; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z1.s, sxtw]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z0.s, sxtw]
 ; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x half>, ptr %a
@@ -1089,10 +1089,10 @@ define void @masked_gather_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) vscale_r
 ; CHECK-NEXT:    ptrue p0.h, vl32
 ; CHECK-NEXT:    ptrue p1.s, vl32
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z1.s, uxtw]
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x2, z0.s, uxtw]
 ; CHECK-NEXT:    st1h { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x half>, ptr %a
@@ -1112,10 +1112,10 @@ define void @masked_gather_64b_scaled(ptr %a, ptr %b, ptr %base) vscale_range(16
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z1.d, lsl #2]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z0.d, lsl #2]
 ; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, ptr %a
@@ -1133,10 +1133,10 @@ define void @masked_gather_64b_unscaled(ptr %a, ptr %b, ptr %base) vscale_range(
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z0.d]
 ; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, ptr %a
@@ -1155,10 +1155,10 @@ define void @masked_gather_vec_plus_reg(ptr %a, ptr %b, i64 %off) vscale_range(1
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z1.d]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x2, z0.d]
 ; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, ptr %a
@@ -1177,10 +1177,10 @@ define void @masked_gather_vec_plus_imm(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d, #4]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d, #4]
 ; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, ptr %a
@@ -1197,15 +1197,15 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #
 ; CHECK-LABEL: masked_gather_passthru:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    ptrue p1.d, vl32
+; CHECK-NEXT:    ptrue p2.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x2]
-; CHECK-NEXT:    punpklo p2.h, p1.b
-; CHECK-NEXT:    ld1w { z1.d }, p2/z, [z1.d]
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    mov z0.s, p1/m, z1.s
+; CHECK-NEXT:    ld1d { z0.d }, p2/z, [x1]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x2]
+; CHECK-NEXT:    punpklo p3.h, p1.b
+; CHECK-NEXT:    ld1w { z0.d }, p3/z, [z0.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, ptr %a
@@ -1223,10 +1223,10 @@ define void @masked_gather_passthru_0(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ptrue p1.d, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
 ; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z1.d]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
 ; CHECK-NEXT:    ret
   %cvals = load <32 x float>, ptr %a

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index 40709ca420bc90..a63b90856a66d8 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -13,9 +13,9 @@ define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_load_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
-; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    ldr s2, [x1]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    fcmeq v1.4h, v1.4h, v2.4h
 ; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    mov v0.h[0], v1.h[0]
@@ -35,8 +35,8 @@ define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_load_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    fcmeq v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
@@ -53,8 +53,8 @@ define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_load_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
@@ -88,8 +88,8 @@ define <8 x float> @masked_load_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define <16 x float> @masked_load_v16f32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
@@ -155,8 +155,8 @@ define <64 x float> @masked_load_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0
 define <64 x i8> @masked_load_v64i8(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w9, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w9, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x9]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x9]
@@ -188,8 +188,8 @@ define <64 x i8> @masked_load_v64i8(ptr %ap, ptr %bp) #0 {
 define <32 x i16> @masked_load_v32i16(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x9, lsl #1]
@@ -221,8 +221,8 @@ define <32 x i16> @masked_load_v32i16(ptr %ap, ptr %bp) #0 {
 define <16 x i32> @masked_load_v16i32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x9, lsl #2]
@@ -254,8 +254,8 @@ define <16 x i32> @masked_load_v16i32(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_v8i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
@@ -287,8 +287,8 @@ define <8 x i64> @masked_load_v8i64(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_passthru_v8i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_passthru_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
@@ -323,8 +323,8 @@ define <8 x i64> @masked_load_passthru_v8i64(ptr %ap, ptr %bp) #0 {
 define <8 x double> @masked_load_passthru_v8f64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_passthru_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x9, lsl #3]
@@ -360,7 +360,7 @@ define <32 x i16> @masked_load_sext_v32i8i16(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
@@ -390,17 +390,17 @@ define <32 x i16> @masked_load_sext_v32i8i16(ptr %ap, ptr %bp) #0 {
 define <16 x i32> @masked_load_sext_v16i8i32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ldr q0, [x1]
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
-; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ldr q0, [x1]
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
 ; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
@@ -424,9 +424,9 @@ define <16 x i32> @masked_load_sext_v16i8i32(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_sext_v8i8i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ldr d0, [x1]
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ldr d0, [x1]
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
 ; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
@@ -434,8 +434,8 @@ define <8 x i64> @masked_load_sext_v8i8i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
@@ -460,7 +460,7 @@ define <16 x i32> @masked_load_sext_v16i16i32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
@@ -490,17 +490,17 @@ define <16 x i32> @masked_load_sext_v16i16i32(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_sext_v8i16i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ldr q0, [x1]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ldr q0, [x1]
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
 ; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
@@ -525,7 +525,7 @@ define <8 x i64> @masked_load_sext_v8i32i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
@@ -556,7 +556,7 @@ define <32 x i16> @masked_load_zext_v32i8i16(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    mov x9, #16
+; VBITS_GE_256-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
@@ -586,17 +586,17 @@ define <32 x i16> @masked_load_zext_v32i8i16(ptr %ap, ptr %bp) #0 {
 define <16 x i32> @masked_load_zext_v16i8i32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ldr q0, [x1]
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
-; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    ldr q0, [x1]
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
 ; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
@@ -620,9 +620,9 @@ define <16 x i32> @masked_load_zext_v16i8i32(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_zext_v8i8i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ldr d0, [x1]
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ldr d0, [x1]
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
 ; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
@@ -630,8 +630,8 @@ define <8 x i64> @masked_load_zext_v8i8i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
@@ -656,7 +656,7 @@ define <16 x i32> @masked_load_zext_v16i16i32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x9, #8
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
@@ -686,17 +686,17 @@ define <16 x i32> @masked_load_zext_v16i16i32(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_zext_v8i16i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ldr q0, [x1]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    ldr q0, [x1]
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
 ; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
@@ -721,7 +721,7 @@ define <8 x i64> @masked_load_zext_v8i32i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
@@ -751,20 +751,20 @@ define <8 x i64> @masked_load_zext_v8i32i64(ptr %ap, ptr %bp) #0 {
 define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, #0
 ; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z1.h, #0
-; VBITS_GE_256-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
+; VBITS_GE_256-NEXT:    ptrue p2.b, vl32
 ; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
-; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
 ; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
-; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
-; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT:    cmpne p1.b, p2/z, z1.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
 ; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
@@ -791,26 +791,26 @@ define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
 define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
 ; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
 ; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
@@ -834,17 +834,17 @@ define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
 ; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
 ; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
@@ -854,8 +854,8 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
@@ -879,20 +879,20 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
 define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
 ; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
 ; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
@@ -921,17 +921,17 @@ define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
 ; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
 ; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
@@ -939,8 +939,8 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
@@ -964,20 +964,20 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
 ; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    ptrue p2.s, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
+; VBITS_GE_256-NEXT:    cmpne p1.s, p2/z, z1.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
@@ -1004,20 +1004,20 @@ define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
 define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x9, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, #0
 ; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z1.h, #0
-; VBITS_GE_256-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
+; VBITS_GE_256-NEXT:    ptrue p2.b, vl32
 ; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
-; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
 ; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
-; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
-; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
+; VBITS_GE_256-NEXT:    cmpne p1.b, p2/z, z1.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
 ; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
@@ -1044,26 +1044,26 @@ define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 {
 define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
 ; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
 ; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x8, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x8]
@@ -1087,17 +1087,17 @@ define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
 ; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
 ; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
@@ -1107,8 +1107,8 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
@@ -1132,20 +1132,20 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 {
 define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
 ; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
 ; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
@@ -1174,17 +1174,17 @@ define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
 ; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
 ; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
@@ -1192,8 +1192,8 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
 ; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x8, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x8]
@@ -1217,20 +1217,20 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 {
 define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x9, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x9, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
 ; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    mov z1.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    ptrue p2.s, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
-; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
+; VBITS_GE_256-NEXT:    cmpne p1.s, p2/z, z1.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
 ; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
@@ -1450,7 +1450,7 @@ define <8 x i64> @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
@@ -1481,7 +1481,7 @@ define <8 x i64> @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x9, #4
+; VBITS_GE_256-NEXT:    mov x9, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpgt p0.s, p0/z, z0.s, #0
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index cb58eb5ebb8910..5f7a22ed055c89 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -13,16 +13,16 @@ define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x0, #1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ldrb w8, [x0, #1]
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    cmeq v1.2s, v0.2s, #0
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT:    st1b { z0.d }, p0, [z2.d]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <2 x i8>, ptr %a
   %ptrs = load <2 x ptr>, ptr %b
@@ -36,15 +36,15 @@ define void @masked_scatter_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    cmeq v2.4h, v0.4h, #0
+; CHECK-NEXT:    cmeq v1.4h, v0.4h, #0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    st1b { z0.d }, p0, [z2.d]
 ; CHECK-NEXT:    ret
   %vals = load <4 x i8>, ptr %a
   %ptrs = load <4 x ptr>, ptr %b
@@ -57,18 +57,19 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_scatter_v8i8:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr d0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    cmeq v1.8b, v0.8b, #0
-; VBITS_GE_256-NEXT:    zip1 v5.8b, v0.8b, v0.8b
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    zip1 v3.8b, v0.8b, v0.8b
 ; VBITS_GE_256-NEXT:    zip1 v2.8b, v1.8b, v0.8b
 ; VBITS_GE_256-NEXT:    zip2 v1.8b, v1.8b, v0.8b
 ; VBITS_GE_256-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    shl v2.4h, v2.4h, #8
 ; VBITS_GE_256-NEXT:    shl v1.4h, v1.4h, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
 ; VBITS_GE_256-NEXT:    sshr v2.4h, v2.4h, #8
 ; VBITS_GE_256-NEXT:    sshr v1.4h, v1.4h, #8
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
@@ -77,27 +78,26 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
 ; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z5.h
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    st1b { z1.d }, p1, [z4.d]
-; VBITS_GE_256-NEXT:    st1b { z0.d }, p0, [z3.d]
+; VBITS_GE_256-NEXT:    st1b { z3.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT:    st1b { z0.d }, p0, [z4.d]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i8:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ldr d0, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmeq v2.8b, v0.8b, #0
+; VBITS_GE_512-NEXT:    cmeq v1.8b, v0.8b, #0
 ; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT:    sunpklo z1.h, z1.b
 ; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_512-NEXT:    sunpklo z2.h, z2.b
+; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_512-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; VBITS_GE_512-NEXT:    st1b { z0.d }, p0, [z1.d]
+; VBITS_GE_512-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_512-NEXT:    st1b { z0.d }, p0, [z2.d]
 ; VBITS_GE_512-NEXT:    ret
   %vals = load <8 x i8>, ptr %a
   %ptrs = load <8 x ptr>, ptr %b
@@ -111,16 +111,16 @@ define void @masked_scatter_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    cmeq v2.16b, v0.16b, #0
+; CHECK-NEXT:    cmeq v1.16b, v0.16b, #0
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    st1b { z0.d }, p0, [z2.d]
 ; CHECK-NEXT:    ret
   %vals = load <16 x i8>, ptr %a
   %ptrs = load <16 x ptr>, ptr %b
@@ -140,9 +140,9 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <32 x i8>, ptr %a
@@ -160,16 +160,16 @@ define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    ldrh w9, [x0, #2]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    ldrh w8, [x0, #2]
-; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mov v0.s[1], w9
 ; CHECK-NEXT:    cmeq v1.2s, v0.2s, #0
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT:    st1h { z0.d }, p0, [z2.d]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <2 x i16>, ptr %a
   %ptrs = load <2 x ptr>, ptr %b
@@ -183,14 +183,14 @@ define void @masked_scatter_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    cmeq v2.4h, v0.4h, #0
+; CHECK-NEXT:    cmeq v1.4h, v0.4h, #0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    st1h { z0.d }, p0, [z2.d]
 ; CHECK-NEXT:    ret
   %vals = load <4 x i16>, ptr %a
   %ptrs = load <4 x ptr>, ptr %b
@@ -203,39 +203,39 @@ define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_scatter_v8i16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr q0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    cmeq v1.8h, v0.8h, #0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
 ; VBITS_GE_256-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    st1h { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT:    st1h { z3.d }, p1, [z2.d]
 ; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z3.s
-; VBITS_GE_256-NEXT:    st1h { z1.d }, p0, [z4.d]
+; VBITS_GE_256-NEXT:    st1h { z0.d }, p0, [z4.d]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ldr q0, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    cmeq v2.8h, v0.8h, #0
+; VBITS_GE_512-NEXT:    cmeq v1.8h, v0.8h, #0
 ; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_512-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [z1.d]
+; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [z2.d]
 ; VBITS_GE_512-NEXT:    ret
   %vals = load <8 x i16>, ptr %a
   %ptrs = load <8 x ptr>, ptr %b
@@ -295,12 +295,12 @@ define void @masked_scatter_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    cmeq v1.2s, v0.2s, #0
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT:    st1w { z0.d }, p0, [z2.d]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <2 x i32>, ptr %a
   %ptrs = load <2 x ptr>, ptr %b
@@ -314,12 +314,12 @@ define void @masked_scatter_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    cmeq v2.4s, v0.4s, #0
+; CHECK-NEXT:    cmeq v1.4s, v0.4s, #0
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    st1w { z0.d }, p0, [z2.d]
 ; CHECK-NEXT:    ret
   %vals = load <4 x i32>, ptr %a
   %ptrs = load <4 x ptr>, ptr %b
@@ -332,23 +332,23 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_scatter_v8i32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; VBITS_GE_256-NEXT:    uunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT:    mov z2.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z2.d, z0.s
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; VBITS_GE_256-NEXT:    punpklo p2.h, p0.b
+; VBITS_GE_256-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1w { z4.d }, p0, [z3.d]
-; VBITS_GE_256-NEXT:    st1w { z0.d }, p1, [z1.d]
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    and p0.b, p2/z, p2.b, p1.b
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1w { z2.d }, p0, [z4.d]
+; VBITS_GE_256-NEXT:    cmpne p0.d, p1/z, z1.d, #0
+; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [z3.d]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i32:
@@ -434,11 +434,11 @@ define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    cmeq v1.2d, v0.2d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    cmeq v2.2d, v0.2d, #0
-; CHECK-NEXT:    cmpne p0.d, p0/z, z2.d, #0
 ; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <2 x i64>, ptr %a
@@ -467,16 +467,16 @@ define void @masked_scatter_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_scatter_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [z3.d]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
+; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z0.d, #0
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p1, [z3.d]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [z2.d]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i64:
@@ -534,20 +534,20 @@ define void @masked_scatter_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
-; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    ldr q3, [x1]
+; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    fcmeq v2.4h, v1.4h, #0.0
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NEXT:    mov w8, v2.s[1]
 ; CHECK-NEXT:    mov v0.h[1], w8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT:    uunpklo z0.d, z1.s
-; CHECK-NEXT:    st1h { z0.d }, p0, [z3.d]
+; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    st1h { z1.d }, p0, [z0.d]
 ; CHECK-NEXT:    ret
   %vals = load <2 x half>, ptr %a
   %ptrs = load <2 x ptr>, ptr %b
@@ -561,14 +561,14 @@ define void @masked_scatter_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    fcmeq v2.4h, v0.4h, #0.0
+; CHECK-NEXT:    fcmeq v1.4h, v0.4h, #0.0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    st1h { z0.d }, p0, [z2.d]
 ; CHECK-NEXT:    ret
   %vals = load <4 x half>, ptr %a
   %ptrs = load <4 x ptr>, ptr %b
@@ -581,39 +581,39 @@ define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_scatter_v8f16:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ldr q0, [x0]
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    fcmeq v1.8h, v0.8h, #0.0
-; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
-; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    uunpklo z3.s, z0.h
+; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT:    sunpklo z2.s, z1.h
 ; VBITS_GE_256-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_256-NEXT:    uunpklo z3.d, z3.s
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
+; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
 ; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z2.d, #0
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    uunpklo z3.s, z3.h
-; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
-; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
-; VBITS_GE_256-NEXT:    st1h { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT:    st1h { z3.d }, p1, [z2.d]
 ; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; VBITS_GE_256-NEXT:    uunpklo z1.d, z3.s
-; VBITS_GE_256-NEXT:    st1h { z1.d }, p0, [z4.d]
+; VBITS_GE_256-NEXT:    st1h { z0.d }, p0, [z4.d]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_scatter_v8f16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ldr q0, [x0]
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    fcmeq v2.8h, v0.8h, #0.0
+; VBITS_GE_512-NEXT:    fcmeq v1.8h, v0.8h, #0.0
 ; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT:    sunpklo z1.s, z1.h
 ; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_512-NEXT:    sunpklo z2.s, z2.h
-; VBITS_GE_512-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [z1.d]
+; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [z2.d]
 ; VBITS_GE_512-NEXT:    ret
   %vals = load <8 x half>, ptr %a
   %ptrs = load <8 x ptr>, ptr %b
@@ -673,12 +673,12 @@ define void @masked_scatter_v2f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    fcmeq v1.2s, v0.2s, #0.0
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT:    st1w { z0.d }, p0, [z2.d]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <2 x float>, ptr %a
   %ptrs = load <2 x ptr>, ptr %b
@@ -692,12 +692,12 @@ define void @masked_scatter_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    fcmeq v2.4s, v0.4s, #0.0
+; CHECK-NEXT:    fcmeq v1.4s, v0.4s, #0.0
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    st1w { z0.d }, p0, [z2.d]
 ; CHECK-NEXT:    ret
   %vals = load <4 x float>, ptr %a
   %ptrs = load <4 x ptr>, ptr %b
@@ -710,23 +710,23 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_scatter_v8f32:
 ; VBITS_GE_256:       // %bb.0:
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
+; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p1/z, [x1]
 ; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; VBITS_GE_256-NEXT:    uunpklo z4.d, z0.s
-; VBITS_GE_256-NEXT:    mov z2.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    punpklo p0.h, p0.b
-; VBITS_GE_256-NEXT:    ext z2.b, z2.b, z2.b, #16
+; VBITS_GE_256-NEXT:    uunpklo z2.d, z0.s
 ; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_256-NEXT:    sunpklo z2.d, z2.s
-; VBITS_GE_256-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; VBITS_GE_256-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_GE_256-NEXT:    st1w { z4.d }, p0, [z3.d]
-; VBITS_GE_256-NEXT:    st1w { z0.d }, p1, [z1.d]
+; VBITS_GE_256-NEXT:    punpklo p2.h, p0.b
+; VBITS_GE_256-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z1.b, #16
+; VBITS_GE_256-NEXT:    and p0.b, p2/z, p2.b, p1.b
+; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
+; VBITS_GE_256-NEXT:    st1w { z2.d }, p0, [z4.d]
+; VBITS_GE_256-NEXT:    cmpne p0.d, p1/z, z1.d, #0
+; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [z3.d]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_scatter_v8f32:
@@ -812,12 +812,12 @@ define void @masked_scatter_v1f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 define void @masked_scatter_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    fcmeq v1.2d, v0.2d, #0.0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT:    st1d { z0.d }, p0, [z2.d]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <2 x double>, ptr %a
   %ptrs = load <2 x ptr>, ptr %b
@@ -832,8 +832,8 @@ define void @masked_scatter_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <4 x double>, ptr %a
   %ptrs = load <4 x ptr>, ptr %b
@@ -845,16 +845,16 @@ define void @masked_scatter_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: masked_scatter_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
-; VBITS_GE_256-NEXT:    fcmeq p0.d, p0/z, z1.d, #0.0
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [z3.d]
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p1, [z2.d]
+; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z1.d, #0.0
+; VBITS_GE_256-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p1, [z3.d]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [z2.d]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: masked_scatter_v8f64:
@@ -862,8 +862,8 @@ define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [z1.d]
+; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
+; VBITS_GE_512-NEXT:    st1d { z0.d }, p1, [z1.d]
 ; VBITS_GE_512-NEXT:    ret
   %vals = load <8 x double>, ptr %a
   %ptrs = load <8 x ptr>, ptr %b
@@ -878,8 +878,8 @@ define void @masked_scatter_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-NEXT:    ptrue p0.d, vl16
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <16 x double>, ptr %a
   %ptrs = load <16 x ptr>, ptr %b
@@ -894,8 +894,8 @@ define void @masked_scatter_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-NEXT:    ptrue p0.d, vl32
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
 ; CHECK-NEXT:    ret
   %vals = load <32 x double>, ptr %a
   %ptrs = load <32 x ptr>, ptr %b
@@ -934,8 +934,8 @@ define void @masked_scatter_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) vscal
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    st1w { z0.s }, p0, [x2, z1.s, sxtw #2]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    st1w { z0.s }, p1, [x2, z1.s, sxtw #2]
 ; CHECK-NEXT:    ret
   %vals = load <32 x float>, ptr %a
   %idxs = load <32 x i32>, ptr %b
@@ -952,8 +952,8 @@ define void @masked_scatter_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) vscal
 ; CHECK-NEXT:    ptrue p0.d, vl32
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    st1d { z0.d }, p0, [x2, z1.d, lsl #3]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    st1d { z0.d }, p1, [x2, z1.d, lsl #3]
 ; CHECK-NEXT:    ret
   %vals = load <32 x double>, ptr %a
   %idxs = load <32 x i32>, ptr %b
@@ -1127,9 +1127,9 @@ define void @masked_scatter_bitcast_infinite_loop(ptr %a, ptr %b, i1 %cond) vsca
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    tbz w2, #0, .LBB47_2
 ; CHECK-NEXT:  // %bb.1: // %bb.1
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
 ; CHECK-NEXT:  .LBB47_2: // %bb.2
 ; CHECK-NEXT:    ret
   %vals = load volatile <8 x double>, ptr %a

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index 60fe507f7882b8..384b2cc6269328 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -13,9 +13,9 @@ define void @masked_store_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v2f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
-; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    ldr s2, [x1]
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    fcmeq v2.4h, v1.4h, v2.4h
 ; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
 ; CHECK-NEXT:    mov v0.h[0], v2.h[0]
@@ -34,8 +34,8 @@ define void @masked_store_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    fcmeq v1.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
@@ -51,8 +51,8 @@ define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_store_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    fcmeq v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
@@ -84,8 +84,8 @@ define void @masked_store_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @masked_store_v16f32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: masked_store_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -146,25 +146,25 @@ define void @masked_store_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
 define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_256-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    mov z3.d, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    splice z3.s, p1, z3.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z3.s, #0
-; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT:    splice z3.s, p0, z3.s, z2.s
+; VBITS_GE_256-NEXT:    cmpne p0.s, p1/z, z3.s, #0
 ; VBITS_GE_256-NEXT:    st1b { z1.s }, p0, [x2]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -187,28 +187,28 @@ define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 {
 define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
-; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_256-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_256-NEXT:    mov z3.d, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
-; VBITS_GE_256-NEXT:    splice z3.s, p1, z3.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; VBITS_GE_256-NEXT:    splice z3.s, p0, z3.s, z2.s
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
+; VBITS_GE_256-NEXT:    uzp1 z2.h, z3.h, z3.h
+; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z2.h, #0
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -231,25 +231,25 @@ define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 {
 define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
+; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
+; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_256-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    mov z3.d, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
+; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    splice z3.s, p1, z3.s, z2.s
-; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z3.s, #0
-; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
+; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
+; VBITS_GE_256-NEXT:    splice z3.s, p0, z3.s, z2.s
+; VBITS_GE_256-NEXT:    cmpne p0.s, p1/z, z3.s, #0
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -272,27 +272,27 @@ define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 {
 define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; VBITS_GE_256-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_256-NEXT:    mov z3.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
 ; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
 ; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
 ; VBITS_GE_256-NEXT:    mov v3.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
-; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
 ; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z3.b, #0
 ; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x2]
 ; VBITS_GE_256-NEXT:    ret
@@ -316,28 +316,28 @@ define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 {
 define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
+; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z1.s, z3.s
+; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; VBITS_GE_256-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    mov z3.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
+; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
 ; VBITS_GE_256-NEXT:    mov v3.d[1], v2.d[0]
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
-; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z0.h
 ; VBITS_GE_256-NEXT:    sunpklo z2.h, z3.b
-; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z2.h, #0
+; VBITS_GE_256-NEXT:    cmpne p0.h, p1/z, z2.h, #0
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
 ; VBITS_GE_256-NEXT:    ret
 ;
@@ -360,25 +360,25 @@ define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 {
 define void @masked_store_trunc_v32i16i8(ptr %ap, ptr %bp, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z2.h
+; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z1.h, z3.h
+; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; VBITS_GE_256-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
 ; VBITS_GE_256-NEXT:    mov z3.h, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
+; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
 ; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
 ; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    splice z3.b, p1, z3.b, z2.b
-; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
-; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
-; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z3.b, #0
-; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
+; VBITS_GE_256-NEXT:    splice z1.b, p0, z1.b, z0.b
+; VBITS_GE_256-NEXT:    splice z3.b, p0, z3.b, z2.b
+; VBITS_GE_256-NEXT:    cmpne p0.b, p1/z, z3.b, #0
 ; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x2]
 ; VBITS_GE_256-NEXT:    ret
 ;

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-no-vscale-range.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-no-vscale-range.ll
index ae350ee4fad999..6e8d477fc3ad58 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-no-vscale-range.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-no-vscale-range.ll
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 ; CHECK-LABEL: mul_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -19,8 +19,8 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
 define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
 ; CHECK-LABEL: sdiv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
index 88732096db67df..24ece2873adb85 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll
@@ -165,8 +165,8 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) #0 {
 define void @test_revhv32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: test_revhv32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p1.d
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
@@ -204,14 +204,14 @@ define void @test_rev_elts_fail(ptr %a) #1 {
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    mov z1.d, z0.d[2]
-; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    mov z2.d, z0.d[3]
+; CHECK-NEXT:    mov x9, v0.d[1]
 ; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    mov z1.d, z0.d[3]
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    stp x9, x8, [sp, #16]
+; CHECK-NEXT:    fmov x10, d2
+; CHECK-NEXT:    stp x10, x8, [sp, #16]
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    stp x9, x8, [sp]
 ; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    stp x10, x11, [sp]
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    mov sp, x29
@@ -272,22 +272,22 @@ define void @test_revv8i32(ptr %a) #0 {
 ; CHECK-NEXT:    ptrue p0.s, vl8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    mov w8, v0.s[1]
-; CHECK-NEXT:    fmov w10, s0
 ; CHECK-NEXT:    mov w9, v0.s[2]
-; CHECK-NEXT:    mov w11, v0.s[3]
+; CHECK-NEXT:    mov w10, v0.s[3]
+; CHECK-NEXT:    fmov w11, s0
 ; CHECK-NEXT:    mov z1.s, z0.s[4]
 ; CHECK-NEXT:    mov z2.s, z0.s[5]
 ; CHECK-NEXT:    mov z3.s, z0.s[6]
 ; CHECK-NEXT:    mov z0.s, z0.s[7]
-; CHECK-NEXT:    stp w8, w10, [sp, #24]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    stp w11, w9, [sp, #16]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    stp w8, w10, [sp, #8]
+; CHECK-NEXT:    stp w8, w11, [sp, #24]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    stp w10, w9, [sp, #16]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    stp w9, w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    stp w9, w8, [sp]
 ; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    stp w11, w9, [sp]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    mov sp, x29
@@ -390,45 +390,45 @@ define void @test_rev_fail(ptr %a) #1 {
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    mov z1.h, z0.h[8]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    mov z4.h, z0.h[11]
-; CHECK-NEXT:    mov z5.h, z0.h[12]
 ; CHECK-NEXT:    mov z2.h, z0.h[9]
-; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s4
 ; CHECK-NEXT:    mov z3.h, z0.h[10]
-; CHECK-NEXT:    strh w9, [sp, #30]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    mov z16.h, z0.h[15]
-; CHECK-NEXT:    fmov w11, s2
-; CHECK-NEXT:    fmov w12, s3
-; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z6.h, z0.h[13]
-; CHECK-NEXT:    mov z7.h, z0.h[14]
-; CHECK-NEXT:    umov w10, v0.h[1]
-; CHECK-NEXT:    strh w9, [sp, #22]
-; CHECK-NEXT:    umov w9, v0.h[2]
-; CHECK-NEXT:    strh w11, [sp, #28]
-; CHECK-NEXT:    fmov w11, s6
-; CHECK-NEXT:    strh w12, [sp, #26]
-; CHECK-NEXT:    fmov w12, s7
-; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    umov w8, v0.h[5]
-; CHECK-NEXT:    strh w10, [sp, #12]
-; CHECK-NEXT:    strh w11, [sp, #20]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    strh w12, [sp, #18]
-; CHECK-NEXT:    umov w12, v0.h[4]
-; CHECK-NEXT:    umov w10, v0.h[6]
-; CHECK-NEXT:    strh w9, [sp, #10]
-; CHECK-NEXT:    umov w9, v0.h[7]
-; CHECK-NEXT:    strh w8, [sp, #4]
+; CHECK-NEXT:    mov z4.h, z0.h[11]
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.h, z0.h[12]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    mov z2.h, z0.h[13]
+; CHECK-NEXT:    strh w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.h, z0.h[14]
+; CHECK-NEXT:    strh w9, [sp, #28]
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z4.h, z0.h[15]
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    strh w8, [sp, #26]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w11, s3
+; CHECK-NEXT:    strh w9, [sp, #24]
+; CHECK-NEXT:    umov w9, v0.h[1]
+; CHECK-NEXT:    fmov w12, s4
+; CHECK-NEXT:    strh w10, [sp, #20]
+; CHECK-NEXT:    umov w10, v0.h[3]
+; CHECK-NEXT:    strh w8, [sp, #22]
+; CHECK-NEXT:    umov w8, v0.h[2]
+; CHECK-NEXT:    strh w11, [sp, #18]
+; CHECK-NEXT:    umov w11, v0.h[4]
+; CHECK-NEXT:    strh w12, [sp, #16]
+; CHECK-NEXT:    umov w12, v0.h[5]
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    umov w9, v0.h[6]
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    umov w8, v0.h[7]
+; CHECK-NEXT:    strh w10, [sp, #8]
+; CHECK-NEXT:    strh w11, [sp, #6]
+; CHECK-NEXT:    strh w12, [sp, #4]
+; CHECK-NEXT:    strh w9, [sp, #2]
+; CHECK-NEXT:    strh w8, [sp]
 ; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    strh w11, [sp, #8]
-; CHECK-NEXT:    strh w12, [sp, #6]
-; CHECK-NEXT:    strh w10, [sp, #2]
-; CHECK-NEXT:    strh w9, [sp]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    mov sp, x29
@@ -453,40 +453,40 @@ define void @test_revv8i16v8i16(ptr %a, ptr %b, ptr %c) #1 {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    ldr q0, [x1]
+; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    orr x9, x8, #0x1e
 ; CHECK-NEXT:    orr x10, x8, #0x1c
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    orr x11, x8, #0x18
-; CHECK-NEXT:    orr x12, x8, #0x10
-; CHECK-NEXT:    str h0, [sp, #22]
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    st1 { v0.h }[4], [x9]
+; CHECK-NEXT:    orr x9, x8, #0x18
+; CHECK-NEXT:    st1 { v0.h }[7], [x9]
 ; CHECK-NEXT:    orr x9, x8, #0xe
-; CHECK-NEXT:    st1 { v0.h }[5], [x10]
-; CHECK-NEXT:    orr x10, x8, #0xc
-; CHECK-NEXT:    st1 { v0.h }[7], [x11]
-; CHECK-NEXT:    orr x11, x8, #0x8
 ; CHECK-NEXT:    st1 { v1.h }[4], [x9]
+; CHECK-NEXT:    orr x9, x8, #0xc
+; CHECK-NEXT:    st1 { v1.h }[5], [x9]
+; CHECK-NEXT:    orr x9, x8, #0x8
+; CHECK-NEXT:    st1 { v0.h }[5], [x10]
+; CHECK-NEXT:    orr x10, x8, #0x10
+; CHECK-NEXT:    st1 { v1.h }[7], [x9]
 ; CHECK-NEXT:    orr x9, x8, #0x4
-; CHECK-NEXT:    st1 { v1.h }[5], [x10]
+; CHECK-NEXT:    st1 { v0.h }[3], [x10]
 ; CHECK-NEXT:    mov w10, #26 // =0x1a
-; CHECK-NEXT:    orr x10, x8, x10
-; CHECK-NEXT:    st1 { v0.h }[3], [x12]
 ; CHECK-NEXT:    st1 { v1.h }[1], [x9]
 ; CHECK-NEXT:    orr x9, x8, #0x2
-; CHECK-NEXT:    st1 { v1.h }[7], [x11]
-; CHECK-NEXT:    mov w11, #20 // =0x14
-; CHECK-NEXT:    mov w12, #18 // =0x12
-; CHECK-NEXT:    st1 { v0.h }[6], [x10]
-; CHECK-NEXT:    mov w10, #10 // =0xa
-; CHECK-NEXT:    orr x11, x8, x11
 ; CHECK-NEXT:    st1 { v1.h }[2], [x9]
-; CHECK-NEXT:    orr x9, x8, x12
-; CHECK-NEXT:    orr x10, x8, x10
-; CHECK-NEXT:    st1 { v1.h }[3], [x8]
-; CHECK-NEXT:    st1 { v0.h }[1], [x11]
-; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    orr x9, x8, x10
+; CHECK-NEXT:    mov w10, #20 // =0x14
+; CHECK-NEXT:    st1 { v0.h }[6], [x9]
+; CHECK-NEXT:    orr x9, x8, x10
+; CHECK-NEXT:    mov w10, #18 // =0x12
+; CHECK-NEXT:    st1 { v0.h }[1], [x9]
+; CHECK-NEXT:    orr x9, x8, x10
 ; CHECK-NEXT:    st1 { v0.h }[2], [x9]
-; CHECK-NEXT:    st1 { v1.h }[6], [x10]
+; CHECK-NEXT:    mov w9, #10 // =0xa
+; CHECK-NEXT:    orr x9, x8, x9
+; CHECK-NEXT:    st1 { v1.h }[3], [x8]
+; CHECK-NEXT:    st1 { v1.h }[6], [x9]
+; CHECK-NEXT:    str h0, [sp, #22]
 ; CHECK-NEXT:    str h1, [sp, #6]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x2]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll
index b29ac96c8a7f53..42b3196fdf1dd6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll
@@ -32,19 +32,19 @@ define void @zip1_v32i8(ptr %a, ptr %b) #0 {
 define void @zip_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: zip_v32i16:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #16
 ; VBITS_EQ_256-NEXT:    ptrue p0.h
+; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
+; VBITS_EQ_256-NEXT:    zip2 z5.h, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    zip1 z0.h, z0.h, z2.h
 ; VBITS_EQ_256-NEXT:    zip2 z4.h, z1.h, z3.h
 ; VBITS_EQ_256-NEXT:    zip1 z1.h, z1.h, z3.h
-; VBITS_EQ_256-NEXT:    zip2 z3.h, z0.h, z2.h
-; VBITS_EQ_256-NEXT:    zip1 z0.h, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    add z2.h, z4.h, z5.h
 ; VBITS_EQ_256-NEXT:    add z0.h, z1.h, z0.h
-; VBITS_EQ_256-NEXT:    add z1.h, z4.h, z3.h
-; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    st1h { z2.h }, p0, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
 ;
@@ -144,13 +144,13 @@ define void @zip_v4f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; VBITS_EQ_512-NEXT:    mov z2.d, z1.d[3]
 ; VBITS_EQ_512-NEXT:    mov z3.d, z0.d[3]
-; VBITS_EQ_512-NEXT:    stp d3, d2, [sp, #16]
-; VBITS_EQ_512-NEXT:    mov z2.d, z1.d[2]
-; VBITS_EQ_512-NEXT:    mov z3.d, z0.d[2]
+; VBITS_EQ_512-NEXT:    mov z4.d, z1.d[2]
+; VBITS_EQ_512-NEXT:    mov z5.d, z0.d[2]
 ; VBITS_EQ_512-NEXT:    zip1 z0.d, z0.d, z1.d
-; VBITS_EQ_512-NEXT:    stp d3, d2, [sp]
-; VBITS_EQ_512-NEXT:    ld1d { z2.d }, p0/z, [x8]
-; VBITS_EQ_512-NEXT:    fadd z0.d, p0/m, z0.d, z2.d
+; VBITS_EQ_512-NEXT:    stp d3, d2, [sp, #16]
+; VBITS_EQ_512-NEXT:    stp d5, d4, [sp]
+; VBITS_EQ_512-NEXT:    ld1d { z1.d }, p0/z, [x8]
+; VBITS_EQ_512-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; VBITS_EQ_512-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_EQ_512-NEXT:    mov sp, x29
 ; VBITS_EQ_512-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
@@ -240,18 +240,18 @@ define void @trn_v32i8(ptr %a, ptr %b) #0 {
 define void @trn_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: trn_v32i16:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    mov x8, #16
 ; VBITS_EQ_256-NEXT:    ptrue p0.h
+; VBITS_EQ_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
 ; VBITS_EQ_256-NEXT:    trn1 z4.h, z0.h, z2.h
-; VBITS_EQ_256-NEXT:    trn1 z5.h, z1.h, z3.h
 ; VBITS_EQ_256-NEXT:    trn2 z0.h, z0.h, z2.h
+; VBITS_EQ_256-NEXT:    trn1 z2.h, z1.h, z3.h
 ; VBITS_EQ_256-NEXT:    trn2 z1.h, z1.h, z3.h
 ; VBITS_EQ_256-NEXT:    add z0.h, z4.h, z0.h
-; VBITS_EQ_256-NEXT:    add z1.h, z5.h, z1.h
+; VBITS_EQ_256-NEXT:    add z1.h, z2.h, z1.h
 ; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_EQ_256-NEXT:    ret
@@ -513,18 +513,18 @@ define void @uzp_v32i8(ptr %a, ptr %b) #1 {
 define void @uzp_v32i16(ptr %a, ptr %b) #1 {
 ; CHECK-LABEL: uzp_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #16
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov x8, #16 // =0x10
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x1]
-; CHECK-NEXT:    uzp1 z5.h, z1.h, z0.h
-; CHECK-NEXT:    uzp2 z0.h, z1.h, z0.h
 ; CHECK-NEXT:    uzp1 z4.h, z3.h, z2.h
 ; CHECK-NEXT:    uzp2 z2.h, z3.h, z2.h
-; CHECK-NEXT:    add z0.h, z5.h, z0.h
+; CHECK-NEXT:    uzp1 z3.h, z1.h, z0.h
+; CHECK-NEXT:    uzp2 z0.h, z1.h, z0.h
 ; CHECK-NEXT:    add z1.h, z4.h, z2.h
+; CHECK-NEXT:    add z0.h, z3.h, z0.h
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -661,13 +661,13 @@ define void @zip_vscale2_4(ptr %a, ptr %b) #2 {
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    mov z2.d, z1.d[3]
 ; CHECK-NEXT:    mov z3.d, z0.d[3]
-; CHECK-NEXT:    stp d3, d2, [sp, #16]
-; CHECK-NEXT:    mov z2.d, z1.d[2]
-; CHECK-NEXT:    mov z3.d, z0.d[2]
+; CHECK-NEXT:    mov z4.d, z1.d[2]
+; CHECK-NEXT:    mov z5.d, z0.d[2]
 ; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    stp d3, d2, [sp]
-; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x8]
-; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    stp d3, d2, [sp, #16]
+; CHECK-NEXT:    stp d5, d4, [sp]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x8]
+; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
index 1304bb8bc69682..4d8855cd257723 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
@@ -4,8 +4,8 @@
 define i1 @ptest_v16i1_256bit_min_sve(ptr %a, ptr %b) vscale_range(2, 0) {
 ; CHECK-LABEL: ptest_v16i1_256bit_min_sve:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; CHECK-NEXT:    fcmne p1.s, p0/z, z0.s, #0.0
@@ -101,9 +101,9 @@ define i1 @ptest_and_v16i1_512bit_sve(ptr %a, ptr %b) vscale_range(4, 4) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    fcmne p0.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    fcmne p0.s, p0/z, z1.s, #0.0
+; CHECK-NEXT:    fcmne p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    fcmne p0.s, p1/z, z0.s, #0.0
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
index c826f7337b18a6..4ba34407ff1846 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll
@@ -10,8 +10,8 @@ define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov w8, v1.s[1]
-; CHECK-NEXT:    mov w9, v1.s[2]
 ; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov w9, v1.s[2]
 ; CHECK-NEXT:    mov v0.h[1], w8
 ; CHECK-NEXT:    mov w8, v1.s[3]
 ; CHECK-NEXT:    mov v0.h[2], w9

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
index 020f45809a2dcb..82d350f6e28f82 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll
@@ -12,8 +12,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -24,8 +24,8 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
 define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -50,8 +50,8 @@ define void @bitreverse_v32i8(ptr %a) vscale_range(2,0) #0 {
 define void @bitreverse_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: bitreverse_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    rbit z0.b, p0/m, z0.b
@@ -104,8 +104,8 @@ define void @bitreverse_v256i8(ptr %a) vscale_range(16,0) #0 {
 define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -116,8 +116,8 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
 define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -142,8 +142,8 @@ define void @bitreverse_v16i16(ptr %a) vscale_range(2,0) #0 {
 define void @bitreverse_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: bitreverse_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    rbit z0.h, p0/m, z0.h
@@ -196,8 +196,8 @@ define void @bitreverse_v128i16(ptr %a) vscale_range(16,0) #0 {
 define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -208,8 +208,8 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
 define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -234,8 +234,8 @@ define void @bitreverse_v8i32(ptr %a) vscale_range(2,0) #0 {
 define void @bitreverse_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: bitreverse_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    rbit z0.s, p0/m, z0.s
@@ -288,8 +288,8 @@ define void @bitreverse_v64i32(ptr %a) vscale_range(16,0) #0 {
 define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -300,8 +300,8 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
 define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
 ; CHECK-LABEL: bitreverse_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -326,8 +326,8 @@ define void @bitreverse_v4i64(ptr %a) vscale_range(2,0) #0 {
 define void @bitreverse_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: bitreverse_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    rbit z0.d, p0/m, z0.d
@@ -418,8 +418,8 @@ define void @bswap_v16i16(ptr %a) vscale_range(2,0) #0 {
 define void @bswap_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: bswap_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    revb z0.h, p0/m, z0.h
@@ -506,8 +506,8 @@ define void @bswap_v8i32(ptr %a) vscale_range(2,0) #0 {
 define void @bswap_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: bswap_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    revb z0.s, p0/m, z0.s
@@ -594,8 +594,8 @@ define void @bswap_v4i64(ptr %a) vscale_range(2,0) #0 {
 define void @bswap_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: bswap_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    revb z0.d, p0/m, z0.d

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
index d291b336d8ae15..21a5abdeaa4d53 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
@@ -8,8 +8,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #1
 ; CHECK-NEXT:    subr z0.b, z0.b, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -21,8 +21,8 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
 define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -47,8 +47,8 @@ define void @sdiv_v32i8(ptr %a) vscale_range(2,0) #0 {
 define void @sdiv_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: sdiv_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    asrd z0.b, p0/m, z0.b, #5
@@ -102,8 +102,8 @@ define void @sdiv_v256i8(ptr %a) vscale_range(16,0) #0 {
 define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -114,8 +114,8 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
 define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #3
 ; CHECK-NEXT:    subr z0.h, z0.h, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -141,8 +141,8 @@ define void @sdiv_v16i16(ptr %a) vscale_range(2,0) #0 {
 define void @sdiv_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: sdiv_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    asrd z0.h, p0/m, z0.h, #5
@@ -196,8 +196,8 @@ define void @sdiv_v128i16(ptr %a) vscale_range(16,0) #0 {
 define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    subr z0.s, z0.s, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -209,8 +209,8 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
 define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -236,8 +236,8 @@ define void @sdiv_v8i32(ptr %a) vscale_range(2,0) #0 {
 define void @sdiv_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: sdiv_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    asrd z0.s, p0/m, z0.s, #5
@@ -290,8 +290,8 @@ define void @sdiv_v64i32(ptr %a) vscale_range(16,0) #0 {
 define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #7
 ; CHECK-NEXT:    subr z0.d, z0.d, #0 // =0x0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -304,8 +304,8 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
 define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
 ; CHECK-LABEL: sdiv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -331,8 +331,8 @@ define void @sdiv_v4i64(ptr %a) vscale_range(2,0) #0 {
 define void @sdiv_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: sdiv_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    asrd z0.d, p0/m, z0.d, #5

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
index fd0811bbf6580b..6e29f7cbabcc80 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll
@@ -31,77 +31,77 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    umov w8, v0.b[8]
-; CHECK-NEXT:    umov w9, v0.b[1]
-; CHECK-NEXT:    umov w10, v0.b[9]
-; CHECK-NEXT:    umov w11, v0.b[2]
-; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    umov w9, v0.b[9]
+; CHECK-NEXT:    umov w10, v0.b[1]
+; CHECK-NEXT:    mov v2.16b, v0.16b
+; CHECK-NEXT:    umov w11, v0.b[15]
+; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    umov w8, v0.b[10]
-; CHECK-NEXT:    mov v1.b[1], w9
-; CHECK-NEXT:    umov w9, v0.b[3]
 ; CHECK-NEXT:    mov v2.b[1], w10
 ; CHECK-NEXT:    umov w10, v0.b[11]
-; CHECK-NEXT:    mov v1.b[2], w11
-; CHECK-NEXT:    umov w11, v0.b[7]
-; CHECK-NEXT:    mov v2.b[2], w8
-; CHECK-NEXT:    umov w8, v0.b[4]
-; CHECK-NEXT:    mov v1.b[3], w9
+; CHECK-NEXT:    mov v1.b[1], w9
+; CHECK-NEXT:    umov w9, v0.b[2]
+; CHECK-NEXT:    mov v1.b[2], w8
+; CHECK-NEXT:    umov w8, v0.b[3]
+; CHECK-NEXT:    mov v2.b[2], w9
 ; CHECK-NEXT:    umov w9, v0.b[12]
-; CHECK-NEXT:    mov v2.b[3], w10
-; CHECK-NEXT:    umov w10, v0.b[5]
-; CHECK-NEXT:    mov v1.b[4], w8
+; CHECK-NEXT:    mov v1.b[3], w10
+; CHECK-NEXT:    umov w10, v0.b[4]
+; CHECK-NEXT:    mov v2.b[3], w8
 ; CHECK-NEXT:    umov w8, v0.b[13]
-; CHECK-NEXT:    mov v2.b[4], w9
-; CHECK-NEXT:    umov w9, v0.b[6]
-; CHECK-NEXT:    mov v1.b[5], w10
+; CHECK-NEXT:    mov v1.b[4], w9
+; CHECK-NEXT:    umov w9, v0.b[5]
+; CHECK-NEXT:    mov v2.b[4], w10
 ; CHECK-NEXT:    umov w10, v0.b[14]
-; CHECK-NEXT:    mov v2.b[5], w8
-; CHECK-NEXT:    mov x8, #16 // =0x10
-; CHECK-NEXT:    mov v1.b[6], w9
-; CHECK-NEXT:    mov x9, #24 // =0x18
-; CHECK-NEXT:    ld1w { z4.s }, p0/z, [x0, x8, lsl #2]
-; CHECK-NEXT:    mov v2.b[6], w10
-; CHECK-NEXT:    umov w10, v0.b[15]
+; CHECK-NEXT:    mov v1.b[5], w8
+; CHECK-NEXT:    umov w8, v0.b[6]
+; CHECK-NEXT:    mov v2.b[5], w9
+; CHECK-NEXT:    umov w9, v0.b[7]
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
-; CHECK-NEXT:    ld1w { z5.s }, p0/z, [x0, x9, lsl #2]
+; CHECK-NEXT:    mov v1.b[6], w10
+; CHECK-NEXT:    mov v2.b[6], w8
 ; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    mov x8, #16 // =0x10
+; CHECK-NEXT:    mov x10, #8 // =0x8
+; CHECK-NEXT:    ld1w { z4.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    mov v1.b[7], w11
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    mov v2.b[7], w10
-; CHECK-NEXT:    lsl z0.s, z0.s, #31
-; CHECK-NEXT:    asr z0.s, z0.s, #31
-; CHECK-NEXT:    mov x11, #8 // =0x8
+; CHECK-NEXT:    mov v2.b[7], w9
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
-; CHECK-NEXT:    and z0.s, z0.s, #0x1
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT:    lsl z3.s, z3.s, #31
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    mov x9, #24 // =0x18
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    asr z0.s, z3.s, #31
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NEXT:    and z0.s, z0.s, #0x1
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    lsl z0.s, z0.s, #31
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
-; CHECK-NEXT:    cmpne p2.s, p0/z, z0.s, #0
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    lsl z3.s, z3.s, #31
+; CHECK-NEXT:    asr z0.s, z0.s, #31
+; CHECK-NEXT:    asr z3.s, z3.s, #31
 ; CHECK-NEXT:    lsl z1.s, z1.s, #31
 ; CHECK-NEXT:    lsl z2.s, z2.s, #31
+; CHECK-NEXT:    and z0.s, z0.s, #0x1
+; CHECK-NEXT:    and z3.s, z3.s, #0x1
 ; CHECK-NEXT:    asr z1.s, z1.s, #31
 ; CHECK-NEXT:    asr z2.s, z2.s, #31
+; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
+; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, #0
+; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
 ; CHECK-NEXT:    and z1.s, z1.s, #0x1
 ; CHECK-NEXT:    and z2.s, z2.s, #0x1
 ; CHECK-NEXT:    mov z4.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z5.s, p2/m, #0 // =0x0
-; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
-; CHECK-NEXT:    cmpne p2.s, p0/z, z2.s, #0
-; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.s, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.s, p2/m, #0 // =0x0
+; CHECK-NEXT:    cmpne p3.s, p0/z, z1.s, #0
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    cmpne p1.s, p0/z, z2.s, #0
 ; CHECK-NEXT:    st1w { z4.s }, p0, [x0, x8, lsl #2]
-; CHECK-NEXT:    st1w { z5.s }, p0, [x0, x9, lsl #2]
-; CHECK-NEXT:    st1w { z3.s }, p0, [x0, x11, lsl #2]
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
+; CHECK-NEXT:    mov z3.s, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    st1w { z3.s }, p0, [x0, x10, lsl #2]
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
 ; CHECK-NEXT:  .LBB1_2: // %exit
 ; CHECK-NEXT:    ret
   %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
index 0204613b9fc8dd..113f7a9465a1fe 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -47,11 +47,11 @@ define void @splat_v32i8(i8 %a, ptr %b) vscale_range(2,0) #0 {
 define void @splat_v64i8(i8 %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: splat_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32
-; VBITS_GE_256-NEXT:    mov z0.b, w0
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
+; VBITS_GE_256-NEXT:    mov z0.b, w0
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1, x8]
+; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: splat_v64i8:
@@ -130,11 +130,11 @@ define void @splat_v16i16(i16 %a, ptr %b) vscale_range(2,0) #0 {
 define void @splat_v32i16(i16 %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: splat_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    mov z0.h, w0
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    mov z0.h, w0
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: splat_v32i16:
@@ -213,11 +213,11 @@ define void @splat_v8i32(i32 %a, ptr %b) vscale_range(2,0) #0 {
 define void @splat_v16i32(i32 %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: splat_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    mov z0.s, w0
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    mov z0.s, w0
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: splat_v16i32:
@@ -296,11 +296,11 @@ define void @splat_v4i64(i64 %a, ptr %b) vscale_range(2,0) #0 {
 define void @splat_v8i64(i64 %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: splat_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    mov z0.d, x0
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
+; VBITS_GE_256-NEXT:    mov z0.d, x0
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: splat_v8i64:
@@ -372,8 +372,8 @@ define <8 x half> @splat_v8f16(half %a) vscale_range(2,0) #0 {
 define void @splat_v16f16(half %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl16
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -386,18 +386,18 @@ define void @splat_v16f16(half %a, ptr %b) vscale_range(2,0) #0 {
 define void @splat_v32f16(half %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: splat_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
-; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 def $z0
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    mov z0.h, h0
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: splat_v32f16:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
+; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; VBITS_GE_512-NEXT:    mov z0.h, h0
 ; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -410,8 +410,8 @@ define void @splat_v32f16(half %a, ptr %b) #0 {
 define void @splat_v64f16(half %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: splat_v64f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl64
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -424,8 +424,8 @@ define void @splat_v64f16(half %a, ptr %b) vscale_range(8,0) #0 {
 define void @splat_v128f16(half %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: splat_v128f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl128
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    mov z0.h, h0
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -462,8 +462,8 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) vscale_range(2,0) #0
 define void @splat_v8f32(float %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -476,18 +476,18 @@ define void @splat_v8f32(float %a, ptr %b) vscale_range(2,0) #0 {
 define void @splat_v16f32(float %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: splat_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
-; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 def $z0
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    mov z0.s, s0
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: splat_v16f32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
+; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; VBITS_GE_512-NEXT:    mov z0.s, s0
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -500,8 +500,8 @@ define void @splat_v16f32(float %a, ptr %b) #0 {
 define void @splat_v32f32(float %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: splat_v32f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl32
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -514,8 +514,8 @@ define void @splat_v32f32(float %a, ptr %b) vscale_range(8,0) #0 {
 define void @splat_v64f32(float %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: splat_v64f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl64
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    mov z0.s, s0
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -550,8 +550,8 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) vscale_range(2,0)
 define void @splat_v4f64(double %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: splat_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -564,18 +564,18 @@ define void @splat_v4f64(double %a, ptr %b) vscale_range(2,0) #0 {
 define void @splat_v8f64(double %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: splat_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    mov z0.d, d0
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: splat_v8f64:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
+; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; VBITS_GE_512-NEXT:    mov z0.d, d0
 ; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
@@ -588,8 +588,8 @@ define void @splat_v8f64(double %a, ptr %b) #0 {
 define void @splat_v16f64(double %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: splat_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -602,8 +602,8 @@ define void @splat_v16f64(double %a, ptr %b) vscale_range(8,0) #0 {
 define void @splat_v32f64(double %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK-LABEL: splat_v32f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl32
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -620,8 +620,8 @@ define void @splat_v32f64(double %a, ptr %b) vscale_range(16,0) #0 {
 define void @splat_imm_v64i8(ptr %a) vscale_range(4,0) #0 {
 ; CHECK-LABEL: splat_imm_v64i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.b, #1 // =0x1
 ; CHECK-NEXT:    ptrue p0.b, vl64
+; CHECK-NEXT:    mov z0.b, #1 // =0x1
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %insert = insertelement <64 x i8> undef, i8 1, i64 0
@@ -633,8 +633,8 @@ define void @splat_imm_v64i8(ptr %a) vscale_range(4,0) #0 {
 define void @splat_imm_v32i16(ptr %a) vscale_range(4,0) #0 {
 ; CHECK-LABEL: splat_imm_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, #2 // =0x2
 ; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    mov z0.h, #2 // =0x2
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %insert = insertelement <32 x i16> undef, i16 2, i64 0
@@ -646,8 +646,8 @@ define void @splat_imm_v32i16(ptr %a) vscale_range(4,0) #0 {
 define void @splat_imm_v16i32(ptr %a) vscale_range(4,0) #0 {
 ; CHECK-LABEL: splat_imm_v16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, #3 // =0x3
 ; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    mov z0.s, #3 // =0x3
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %insert = insertelement <16 x i32> undef, i32 3, i64 0
@@ -659,8 +659,8 @@ define void @splat_imm_v16i32(ptr %a) vscale_range(4,0) #0 {
 define void @splat_imm_v8i64(ptr %a) vscale_range(4,0) #0 {
 ; CHECK-LABEL: splat_imm_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, #4 // =0x4
 ; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    mov z0.d, #4 // =0x4
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %insert = insertelement <8 x i64> undef, i64 4, i64 0
@@ -676,8 +676,8 @@ define void @splat_imm_v8i64(ptr %a) vscale_range(4,0) #0 {
 define void @splat_imm_v32f16(ptr %a) vscale_range(4,0) #0 {
 ; CHECK-LABEL: splat_imm_v32f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov z0.h, #5.00000000
 ; CHECK-NEXT:    ptrue p0.h, vl32
+; CHECK-NEXT:    fmov z0.h, #5.00000000
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %insert = insertelement <32 x half> undef, half 5.0, i64 0
@@ -689,8 +689,8 @@ define void @splat_imm_v32f16(ptr %a) vscale_range(4,0) #0 {
 define void @splat_imm_v16f32(ptr %a) vscale_range(4,0) #0 {
 ; CHECK-LABEL: splat_imm_v16f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov z0.s, #6.00000000
 ; CHECK-NEXT:    ptrue p0.s, vl16
+; CHECK-NEXT:    fmov z0.s, #6.00000000
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %insert = insertelement <16 x float> undef, float 6.0, i64 0
@@ -702,8 +702,8 @@ define void @splat_imm_v16f32(ptr %a) vscale_range(4,0) #0 {
 define void @splat_imm_v8f64(ptr %a) vscale_range(4,0) #0 {
 ; CHECK-LABEL: splat_imm_v8f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov z0.d, #7.00000000
 ; CHECK-NEXT:    ptrue p0.d, vl8
+; CHECK-NEXT:    fmov z0.d, #7.00000000
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %insert = insertelement <8 x double> undef, double 7.0, i64 0

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
index 0acdc8d5d60883..03bff6cb9b62df 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll
@@ -52,11 +52,11 @@ define void @store_v8f32(ptr %a) #0 {
 define void @store_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: store_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    mov z0.s, #0 // =0x0
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: store_v16f32:
@@ -86,24 +86,24 @@ define void @store_v16f32(ptr %a) #0 {
 define void @store_v32f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: store_v32f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x9, #16
-; VBITS_GE_256-NEXT:    mov x10, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_256-NEXT:    mov x8, #24 // =0x18
+; VBITS_GE_256-NEXT:    mov x9, #16 // =0x10
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: store_v32f32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov x8, #16
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
 ; VBITS_GE_512-NEXT:    mov z0.s, #0 // =0x0
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
 ;
 ; VBITS_GE_1024-LABEL: store_v32f32:
@@ -126,45 +126,45 @@ define void @store_v32f32(ptr %a) #0 {
 define void @store_v64f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: store_v64f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #56
-; VBITS_GE_256-NEXT:    mov x9, #48
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT:    mov z0.s, #0 // =0x0
-; VBITS_GE_256-NEXT:    mov x10, #40
-; VBITS_GE_256-NEXT:    mov x11, #32
+; VBITS_GE_256-NEXT:    mov x8, #56 // =0x38
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov x8, #24
-; VBITS_GE_256-NEXT:    mov x12, #16
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_256-NEXT:    mov x9, #8
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x10, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x11, lsl #2]
+; VBITS_GE_256-NEXT:    mov x8, #48 // =0x30
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    mov x8, #40 // =0x28
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    mov x8, #32 // =0x20
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    mov x8, #24 // =0x18
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x12, lsl #2]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: store_v64f32:
 ; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    mov x8, #48
-; VBITS_GE_512-NEXT:    mov x9, #32
-; VBITS_GE_512-NEXT:    mov x10, #16
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
 ; VBITS_GE_512-NEXT:    mov z0.s, #0 // =0x0
+; VBITS_GE_512-NEXT:    mov x8, #48 // =0x30
+; VBITS_GE_512-NEXT:    mov x9, #32 // =0x20
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_512-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0, x9, lsl #2]
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0, x10, lsl #2]
+; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
 ; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
 ;
 ; VBITS_GE_1024-LABEL: store_v64f32:
 ; VBITS_GE_1024:       // %bb.0:
-; VBITS_GE_1024-NEXT:    mov x8, #32
 ; VBITS_GE_1024-NEXT:    ptrue p0.s, vl32
 ; VBITS_GE_1024-NEXT:    mov z0.s, #0 // =0x0
-; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_1024-NEXT:    mov x8, #32 // =0x20
 ; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_1024-NEXT:    st1w { z0.s }, p0, [x0]
 ; VBITS_GE_1024-NEXT:    ret
 ;
 ; VBITS_GE_2048-LABEL: store_v64f32:

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
index 6093a2ce09620e..557b349d482f03 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll
@@ -46,8 +46,8 @@ bb1:
 define void @subvector_v32i16(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: subvector_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
@@ -101,8 +101,8 @@ bb1:
 define void @subvector_v16i32(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: subvector_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
@@ -157,8 +157,8 @@ bb1:
 define void @subvector_v8i64(ptr %in, ptr %out) vscale_range(2,0) #0 {
 ; CHECK-LABEL: subvector_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4
 ; CHECK-NEXT:    ptrue p0.d, vl4
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
@@ -234,8 +234,8 @@ bb1:
 define void @subvector_v32f16(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: subvector_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
@@ -289,8 +289,8 @@ bb1:
 define void @subvector_v16f32(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: subvector_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
@@ -343,8 +343,8 @@ bb1:
 define void @subvector_v8f64(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: subvector_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
index 6f7220a74b067b..2dc4bddb81a6db 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
@@ -8,8 +8,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) vscale_range(2,0) #0 {
 ; CHECK-LABEL: store_trunc_v2i64i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
@@ -34,16 +34,16 @@ define void @store_trunc_v4i64i8(ptr %ap, ptr %dest) vscale_range(2,0) #0 {
 define void @store_trunc_v8i64i8(ptr %ap, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: store_trunc_v8i64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1b { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1b { z1.s }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: store_trunc_v8i64i8:
@@ -88,8 +88,8 @@ define void @store_trunc_v8i64i16(ptr %ap, ptr %dest) #0 {
 ; Currently does not use the truncating store
 ; VBITS_GE_256-LABEL: store_trunc_v8i64i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
@@ -115,16 +115,16 @@ define void @store_trunc_v8i64i16(ptr %ap, ptr %dest) #0 {
 define void @store_trunc_v8i64i32(ptr %ap, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: store_trunc_v8i64i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: store_trunc_v8i64i32:
@@ -143,8 +143,8 @@ define void @store_trunc_v16i32i8(ptr %ap, ptr %dest) #0 {
 ; Currently does not use the truncating store
 ; VBITS_GE_256-LABEL: store_trunc_v16i32i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
@@ -170,16 +170,16 @@ define void @store_trunc_v16i32i8(ptr %ap, ptr %dest) #0 {
 define void @store_trunc_v16i32i16(ptr %ap, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: store_trunc_v16i32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: store_trunc_v16i32i16:
@@ -197,16 +197,16 @@ define void @store_trunc_v16i32i16(ptr %ap, ptr %dest) #0 {
 define void @store_trunc_v32i16i8(ptr %ap, ptr %dest) #0 {
 ; VBITS_GE_256-LABEL: store_trunc_v32i16i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
 ; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; VBITS_GE_256-NEXT:    splice z1.b, p0, z1.b, z0.b
-; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
-; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1b { z1.b }, p1, [x1]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: store_trunc_v32i16i8:

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
index c94d07d65ad533..8dc45eadce6f3b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll
@@ -26,8 +26,8 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) vscale_range(2,0) #0 {
 define void @trunc_v32i16_v32i8(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: trunc_v32i16_v32i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
@@ -112,8 +112,8 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) vscale_range(2,0) #0 {
 define <16 x i8> @trunc_v16i32_v16i8(ptr %in) #0 {
 ; VBITS_GE_256-LABEL: trunc_v16i32_v16i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
@@ -196,8 +196,8 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) vscale_range(2,0) #0 {
 define void @trunc_v16i32_v16i16(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: trunc_v16i32_v16i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
@@ -283,8 +283,8 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) vscale_range(2,0) #0 {
 define <8 x i8> @trunc_v8i64_v8i8(ptr %in) #0 {
 ; VBITS_GE_256-LABEL: trunc_v8i64_v8i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
@@ -366,8 +366,8 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) vscale_range(2,0) #0 {
 define <8 x i16> @trunc_v8i64_v8i16(ptr %in) #0 {
 ; VBITS_GE_256-LABEL: trunc_v8i64_v8i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
@@ -450,8 +450,8 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) vscale_range(2,0) #0 {
 define void @trunc_v8i64_v8i32(ptr %in, ptr %out) #0 {
 ; VBITS_GE_256-LABEL: trunc_v8i64_v8i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl4

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
index 88ccde53d7dc19..f32175a42d8eaa 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
@@ -50,19 +50,19 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shuffle_ext_byone_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i8:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
+; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
 ; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z0.b, z0.b[31]
 ; VBITS_GE_256-NEXT:    mov z3.b, z2.b[31]
+; VBITS_GE_256-NEXT:    mov z0.b, z0.b[31]
+; VBITS_GE_256-NEXT:    fmov w9, s3
+; VBITS_GE_256-NEXT:    insr z1.b, w9
 ; VBITS_GE_256-NEXT:    fmov w9, s0
-; VBITS_GE_256-NEXT:    fmov w10, s3
 ; VBITS_GE_256-NEXT:    insr z2.b, w9
-; VBITS_GE_256-NEXT:    insr z1.b, w10
-; VBITS_GE_256-NEXT:    st1b { z2.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0, x8]
+; VBITS_GE_256-NEXT:    st1b { z2.b }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v64i8:
@@ -94,12 +94,12 @@ define void @shuffle_ext_byone_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl128
 ; CHECK-NEXT:    mov w8, #127 // =0x7f
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.b, xzr, x8
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    lastb w8, p1, z0.b
-; CHECK-NEXT:    insr z1.b, w8
-; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x1]
+; CHECK-NEXT:    insr z0.b, w8
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i8>, ptr %a
   %op2 = load <128 x i8>, ptr %b
@@ -128,12 +128,12 @@ define void @shuffle_ext_byone_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b, vl256
 ; CHECK-NEXT:    mov w8, #255 // =0xff
-; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
-; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.b, xzr, x8
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    lastb w8, p1, z0.b
-; CHECK-NEXT:    insr z1.b, w8
-; CHECK-NEXT:    st1b { z1.b }, p0, [x0]
+; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x1]
+; CHECK-NEXT:    insr z0.b, w8
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <256 x i8>, ptr %a
   %op2 = load <256 x i8>, ptr %b
@@ -215,19 +215,19 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shuffle_ext_byone_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
 ; VBITS_GE_256-NEXT:    mov z3.h, z2.h[15]
+; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
+; VBITS_GE_256-NEXT:    fmov w9, s3
+; VBITS_GE_256-NEXT:    insr z1.h, w9
 ; VBITS_GE_256-NEXT:    fmov w9, s0
-; VBITS_GE_256-NEXT:    fmov w10, s3
 ; VBITS_GE_256-NEXT:    insr z2.h, w9
-; VBITS_GE_256-NEXT:    insr z1.h, w10
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v32i16:
@@ -255,12 +255,12 @@ define void @shuffle_ext_byone_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl64
 ; CHECK-NEXT:    mov w8, #63 // =0x3f
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.h, xzr, x8
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    lastb w8, p1, z0.h
-; CHECK-NEXT:    insr z1.h, w8
-; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x1]
+; CHECK-NEXT:    insr z0.h, w8
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <64 x i16>, ptr %a
   %op2 = load <64 x i16>, ptr %b
@@ -281,12 +281,12 @@ define void @shuffle_ext_byone_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl128
 ; CHECK-NEXT:    mov w8, #127 // =0x7f
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.h, xzr, x8
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    lastb w8, p1, z0.h
-; CHECK-NEXT:    insr z1.h, w8
-; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x1]
+; CHECK-NEXT:    insr z0.h, w8
+; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x i16>, ptr %a
   %op2 = load <128 x i16>, ptr %b
@@ -351,19 +351,19 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shuffle_ext_byone_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
 ; VBITS_GE_256-NEXT:    mov z3.s, z2.s[7]
+; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
+; VBITS_GE_256-NEXT:    fmov w9, s3
+; VBITS_GE_256-NEXT:    insr z1.s, w9
 ; VBITS_GE_256-NEXT:    fmov w9, s0
-; VBITS_GE_256-NEXT:    fmov w10, s3
 ; VBITS_GE_256-NEXT:    insr z2.s, w9
-; VBITS_GE_256-NEXT:    insr z1.s, w10
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v16i32:
@@ -389,12 +389,12 @@ define void @shuffle_ext_byone_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.s, xzr, x8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    lastb w8, p1, z0.s
-; CHECK-NEXT:    insr z1.s, w8
-; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    insr z0.s, w8
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i32>, ptr %a
   %op2 = load <32 x i32>, ptr %b
@@ -411,12 +411,12 @@ define void @shuffle_ext_byone_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl64
 ; CHECK-NEXT:    mov w8, #63 // =0x3f
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.s, xzr, x8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    lastb w8, p1, z0.s
-; CHECK-NEXT:    insr z1.s, w8
-; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x1]
+; CHECK-NEXT:    insr z0.s, w8
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <64 x i32>, ptr %a
   %op2 = load <64 x i32>, ptr %b
@@ -463,19 +463,19 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shuffle_ext_byone_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8i64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
 ; VBITS_GE_256-NEXT:    mov z3.d, z2.d[3]
+; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
+; VBITS_GE_256-NEXT:    fmov x9, d3
+; VBITS_GE_256-NEXT:    insr z1.d, x9
 ; VBITS_GE_256-NEXT:    fmov x9, d0
-; VBITS_GE_256-NEXT:    fmov x10, d3
 ; VBITS_GE_256-NEXT:    insr z2.d, x9
-; VBITS_GE_256-NEXT:    insr z1.d, x10
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v8i64:
@@ -500,12 +500,12 @@ define void @shuffle_ext_byone_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
 ; CHECK-NEXT:    mov w8, #15 // =0xf
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.d, xzr, x8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    lastb x8, p1, z0.d
-; CHECK-NEXT:    insr z1.d, x8
-; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    insr z0.d, x8
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i64>, ptr %a
   %op2 = load <16 x i64>, ptr %b
@@ -520,12 +520,12 @@ define void @shuffle_ext_byone_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl32
 ; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    whilels p1.d, xzr, x8
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    lastb x8, p1, z0.d
-; CHECK-NEXT:    insr z1.d, x8
-; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
+; CHECK-NEXT:    insr z0.d, x8
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i64>, ptr %a
   %op2 = load <32 x i64>, ptr %b
@@ -578,17 +578,17 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shuffle_ext_byone_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
 ; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
 ; VBITS_GE_256-NEXT:    mov z3.h, z2.h[15]
-; VBITS_GE_256-NEXT:    insr z2.h, h0
+; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
 ; VBITS_GE_256-NEXT:    insr z1.h, h3
-; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    insr z2.h, h0
 ; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v32f16:
@@ -615,9 +615,9 @@ define void @shuffle_ext_byone_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl64
 ; CHECK-NEXT:    mov w8, #63 // =0x3f
+; CHECK-NEXT:    whilels p1.h, xzr, x8
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    whilels p1.h, xzr, x8
 ; CHECK-NEXT:    lastb h0, p1, z0.h
 ; CHECK-NEXT:    insr z1.h, h0
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
@@ -641,9 +641,9 @@ define void @shuffle_ext_byone_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl128
 ; CHECK-NEXT:    mov w8, #127 // =0x7f
+; CHECK-NEXT:    whilels p1.h, xzr, x8
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    whilels p1.h, xzr, x8
 ; CHECK-NEXT:    lastb h0, p1, z0.h
 ; CHECK-NEXT:    insr z1.h, h0
 ; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
@@ -710,17 +710,17 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shuffle_ext_byone_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
 ; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
 ; VBITS_GE_256-NEXT:    mov z3.s, z2.s[7]
-; VBITS_GE_256-NEXT:    insr z2.s, s0
+; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
 ; VBITS_GE_256-NEXT:    insr z1.s, s3
-; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    insr z2.s, s0
 ; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v16f32:
@@ -745,9 +745,9 @@ define void @shuffle_ext_byone_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
 ; CHECK-NEXT:    mov w8, #31 // =0x1f
+; CHECK-NEXT:    whilels p1.s, xzr, x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    whilels p1.s, xzr, x8
 ; CHECK-NEXT:    lastb s0, p1, z0.s
 ; CHECK-NEXT:    insr z1.s, s0
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
@@ -767,9 +767,9 @@ define void @shuffle_ext_byone_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl64
 ; CHECK-NEXT:    mov w8, #63 // =0x3f
+; CHECK-NEXT:    whilels p1.s, xzr, x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    whilels p1.s, xzr, x8
 ; CHECK-NEXT:    lastb s0, p1, z0.s
 ; CHECK-NEXT:    insr z1.s, s0
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
@@ -818,17 +818,17 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 define void @shuffle_ext_byone_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
 ; VBITS_GE_256-NEXT:    mov z3.d, z2.d[3]
-; VBITS_GE_256-NEXT:    insr z2.d, d0
+; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
 ; VBITS_GE_256-NEXT:    insr z1.d, d3
-; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    insr z2.d, d0
 ; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v8f64:
@@ -852,9 +852,9 @@ define void @shuffle_ext_byone_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
 ; CHECK-NEXT:    mov w8, #15 // =0xf
+; CHECK-NEXT:    whilels p1.d, xzr, x8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    whilels p1.d, xzr, x8
 ; CHECK-NEXT:    lastb d0, p1, z0.d
 ; CHECK-NEXT:    insr z1.d, d0
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
@@ -872,9 +872,9 @@ define void @shuffle_ext_byone_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl32
 ; CHECK-NEXT:    mov w8, #31 // =0x1f
+; CHECK-NEXT:    whilels p1.d, xzr, x8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    whilels p1.d, xzr, x8
 ; CHECK-NEXT:    lastb d0, p1, z0.d
 ; CHECK-NEXT:    insr z1.d, d0
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
@@ -921,10 +921,10 @@ define void @shuffle_ext_invalid(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT:    mov z2.d, z1.d[1]
-; CHECK-NEXT:    stp d1, d2, [sp, #16]
-; CHECK-NEXT:    mov z1.d, z0.d[3]
+; CHECK-NEXT:    mov z3.d, z0.d[3]
 ; CHECK-NEXT:    mov z0.d, z0.d[2]
-; CHECK-NEXT:    stp d0, d1, [sp]
+; CHECK-NEXT:    stp d1, d2, [sp, #16]
+; CHECK-NEXT:    stp d0, d3, [sp]
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    mov sp, x29

diff --git a/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
index f4ac22f7a6b040..0c65a29e8b2817 100644
--- a/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
@@ -7,9 +7,9 @@
 define void @ld1w_reg_loop(ptr %addr) {
 ; CHECK-LABEL: ld1w_reg_loop:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    cntw x9
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:  .LBB0_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
@@ -37,9 +37,9 @@ for.cond.cleanup:
 define void @st1w_reg_loop(ptr %addr, <vscale x 4 x i32> %val) {
 ; CHECK-LABEL: st1w_reg_loop:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    cntw x9
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:  .LBB1_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]

diff --git a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
index 641cae07ba329d..193956f63c30d9 100644
--- a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
+++ b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
@@ -65,9 +65,9 @@ define <vscale x 4 x i32> @sti64ldi32(<vscale x 2 x i64>* nocapture %P, <vscale
 ; CHECK-LABEL: sti64ldi32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0, #1, mul vl]
 ; CHECK-NEXT:    ret
 entry:
   %0 = bitcast <vscale x 2 x i64>* %P to <vscale x 4 x i32>*

diff --git a/llvm/test/CodeGen/AArch64/sve-fp-combine.ll b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll
index e53f76f6512127..ddede0feca16a6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll
@@ -956,9 +956,9 @@ define <vscale x 2 x double> @fsub_d_sel_negzero(<vscale x 2 x double> %a, <vsca
 define <vscale x 8 x half> @fadd_sel_fmul_h(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c, <vscale x 8 x i1> %mask) {
 ; CHECK-LABEL: fadd_sel_fmul_h:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z3.h, #0 // =0x0
 ; CHECK-NEXT:    fmul z1.h, z1.h, z2.h
-; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
+; CHECK-NEXT:    mov z2.h, #0 // =0x0
+; CHECK-NEXT:    sel z1.h, p0, z1.h, z2.h
 ; CHECK-NEXT:    fadd z0.h, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %fmul = fmul <vscale x 8 x half> %b, %c
@@ -970,9 +970,9 @@ define <vscale x 8 x half> @fadd_sel_fmul_h(<vscale x 8 x half> %a, <vscale x 8
 define <vscale x 4 x float> @fadd_sel_fmul_s(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: fadd_sel_fmul_s:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z3.s, #0 // =0x0
 ; CHECK-NEXT:    fmul z1.s, z1.s, z2.s
-; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
+; CHECK-NEXT:    mov z2.s, #0 // =0x0
+; CHECK-NEXT:    sel z1.s, p0, z1.s, z2.s
 ; CHECK-NEXT:    fadd z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %fmul = fmul <vscale x 4 x float> %b, %c
@@ -984,9 +984,9 @@ define <vscale x 4 x float> @fadd_sel_fmul_s(<vscale x 4 x float> %a, <vscale x
 define <vscale x 2 x double> @fadd_sel_fmul_d(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: fadd_sel_fmul_d:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z3.d, #0 // =0x0
 ; CHECK-NEXT:    fmul z1.d, z1.d, z2.d
-; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
+; CHECK-NEXT:    mov z2.d, #0 // =0x0
+; CHECK-NEXT:    sel z1.d, p0, z1.d, z2.d
 ; CHECK-NEXT:    fadd z0.d, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %fmul = fmul <vscale x 2 x double> %b, %c

diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
index 435059482afdd1..f6059d715a0523 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
@@ -6,13 +6,13 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, <vscale x 2 x ptr> %i37, <
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mov w8, #3745 // =0xea1
-; CHECK-NEXT:    ld1w { z3.d }, p0/z, [x0]
 ; CHECK-NEXT:    movk w8, #16618, lsl #16
-; CHECK-NEXT:    mov w9, #57344 // =0xe000
-; CHECK-NEXT:    movk w9, #17535, lsl #16
 ; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    mov w8, #57344 // =0xe000
+; CHECK-NEXT:    movk w8, #17535, lsl #16
+; CHECK-NEXT:    mov z5.s, w8
+; CHECK-NEXT:    ld1w { z3.d }, p0/z, [x0]
 ; CHECK-NEXT:    fmul z4.s, p0/m, z4.s, z3.s
-; CHECK-NEXT:    mov z5.s, w9
 ; CHECK-NEXT:    fadd z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    mov z5.d, #1023 // =0x3ff
 ; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.s

diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
index 259f457d3ad241..3235dd37e19e4b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
@@ -11,8 +11,8 @@ target triple = "aarch64-linux-gnu"
 define half @fadda_nxv2f16(half %init, <vscale x 2 x half> %a) {
 ; CHECK-LABEL: fadda_nxv2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    fadda h0, p0, h0, z1.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -23,8 +23,8 @@ define half @fadda_nxv2f16(half %init, <vscale x 2 x half> %a) {
 define half @fadda_nxv4f16(half %init, <vscale x 4 x half> %a) {
 ; CHECK-LABEL: fadda_nxv4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    fadda h0, p0, h0, z1.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -35,8 +35,8 @@ define half @fadda_nxv4f16(half %init, <vscale x 4 x half> %a) {
 define half @fadda_nxv8f16(half %init, <vscale x 8 x half> %a) {
 ; CHECK-LABEL: fadda_nxv8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    fadda h0, p0, h0, z1.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -51,12 +51,12 @@ define half @fadda_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    fmov s0, s1
-; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    st1h { z2.d }, p1, [sp, #3, mul vl]
 ; CHECK-NEXT:    ld1h { z2.h }, p0/z, [sp]
 ; CHECK-NEXT:    fadda h0, p0, h0, z2.h
@@ -75,23 +75,23 @@ define half @fadda_nxv10f16(<vscale x 10 x half> %v, half %s) {
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    fadda h2, p0, h2, z0.h
 ; CHECK-NEXT:    st1h { z1.h }, p0, [sp]
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
-; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    addvl x8, sp, #1
-; CHECK-NEXT:    st1h { z3.d }, p1, [sp, #1, mul vl]
-; CHECK-NEXT:    fadda h2, p0, h2, z0.h
+; CHECK-NEXT:    st1h { z0.d }, p1, [sp, #1, mul vl]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [sp]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    st1h { z3.d }, p1, [sp, #6, mul vl]
+; CHECK-NEXT:    st1h { z0.d }, p1, [sp, #6, mul vl]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #2, mul vl]
-; CHECK-NEXT:    st1h { z3.d }, p1, [x8, #7, mul vl]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [sp, #2, mul vl]
-; CHECK-NEXT:    fadda h2, p0, h2, z1.h
+; CHECK-NEXT:    st1h { z0.d }, p1, [x8, #7, mul vl]
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp, #2, mul vl]
+; CHECK-NEXT:    fadda h2, p0, h2, z0.h
 ; CHECK-NEXT:    fmov s0, s2
 ; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -103,14 +103,14 @@ define half @fadda_nxv10f16(<vscale x 10 x half> %v, half %s) {
 define half @fadda_nxv12f16(<vscale x 12 x half> %v, half %s) {
 ; CHECK-LABEL: fadda_nxv12f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
-; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT:    mov w8, #32768 // =0x8000
+; CHECK-NEXT:    fadda h2, p0, h2, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z1.h
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    fadda h2, p0, h2, z0.h
-; CHECK-NEXT:    mov z3.h, w8
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z3.h
-; CHECK-NEXT:    fadda h2, p0, h2, z1.h
 ; CHECK-NEXT:    fmov s0, s2
 ; CHECK-NEXT:    ret
   %res = call half @llvm.vector.reduce.fadd.nxv12f16(half %s, <vscale x 12 x half> %v)
@@ -120,8 +120,8 @@ define half @fadda_nxv12f16(<vscale x 12 x half> %v, half %s) {
 define float @fadda_nxv2f32(float %init, <vscale x 2 x float> %a) {
 ; CHECK-LABEL: fadda_nxv2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    fadda s0, p0, s0, z1.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -132,8 +132,8 @@ define float @fadda_nxv2f32(float %init, <vscale x 2 x float> %a) {
 define float @fadda_nxv4f32(float %init, <vscale x 4 x float> %a) {
 ; CHECK-LABEL: fadda_nxv4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    fadda s0, p0, s0, z1.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -144,8 +144,8 @@ define float @fadda_nxv4f32(float %init, <vscale x 4 x float> %a) {
 define double @fadda_nxv2f64(double %init, <vscale x 2 x double> %a) {
 ; CHECK-LABEL: fadda_nxv2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fadda d0, p0, d0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-fpext-load.ll b/llvm/test/CodeGen/AArch64/sve-fpext-load.ll
index f18252b6bfe76f..5a600915bf79de 100644
--- a/llvm/test/CodeGen/AArch64/sve-fpext-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fpext-load.ll
@@ -45,7 +45,6 @@ define <vscale x 8 x double> @ext8_f16_f64(<vscale x 8 x half> *%ptr, i64 %index
 ; CHECK-NEXT:    uunpklo z2.d, z1.s
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
-; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    uunpkhi z4.d, z0.s
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z2.h
@@ -53,6 +52,7 @@ define <vscale x 8 x double> @ext8_f16_f64(<vscale x 8 x half> *%ptr, i64 %index
 ; CHECK-NEXT:    fcvt z2.d, p0/m, z3.h
 ; CHECK-NEXT:    movprfx z3, z4
 ; CHECK-NEXT:    fcvt z3.d, p0/m, z4.h
+; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    ret
   %load = load <vscale x 8 x half>, <vscale x 8 x half>* %ptr, align 4
   %load.ext = fpext <vscale x 8 x half> %load to <vscale x 8 x double>

diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
index 28bcba4b063a9f..43113df07c079f 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll
@@ -14,23 +14,22 @@ declare <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f32.nxv4i64(<vscale x 4 x float>
 define <vscale x 2 x i32> @test_signed_v2f32_v2i32(<vscale x 2 x float> %f) {
 ; CHECK-LABEL: test_signed_v2f32_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-822083584
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #-822083584 // =0xcf000000
 ; CHECK-NEXT:    mov z2.d, #0xffffffff80000000
 ; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    mov w8, #1325400063
+; CHECK-NEXT:    mov w8, #1325400063 // =0x4effffff
+; CHECK-NEXT:    mov z3.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    mov z3.d, #0x7fffffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z2.d, #0x7fffffff
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2f32.nxv2i32(<vscale x 2 x float> %f)
     ret <vscale x 2 x i32> %x
@@ -39,23 +38,22 @@ define <vscale x 2 x i32> @test_signed_v2f32_v2i32(<vscale x 2 x float> %f) {
 define <vscale x 4 x i32> @test_signed_v4f32_v4i32(<vscale x 4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-822083584
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #-822083584 // =0xcf000000
 ; CHECK-NEXT:    mov z2.s, #0x80000000
 ; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    mov w8, #1325400063
+; CHECK-NEXT:    mov w8, #1325400063 // =0x4effffff
+; CHECK-NEXT:    mov z3.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    mov z3.s, #0x7fffffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.s, p1/m, z2.s
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z2.s, #0x7fffffff
-; CHECK-NEXT:    mov z1.s, p1/m, z2.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.s, p1/m, z2.s
+; CHECK-NEXT:    sel z0.s, p2, z3.s, z1.s
+; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4f32.nxv4i32(<vscale x 4 x float> %f)
     ret <vscale x 4 x i32> %x
@@ -64,33 +62,39 @@ define <vscale x 4 x i32> @test_signed_v4f32_v4i32(<vscale x 4 x float> %f) {
 define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
 ; CHECK-LABEL: test_signed_v8f32_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-822083584
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov w9, #1325400063
+; CHECK-NEXT:    mov w8, #-822083584 // =0xcf000000
 ; CHECK-NEXT:    mov z3.s, #0x80000000
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.s, p0/m, z0.s
-; CHECK-NEXT:    mov z6.s, #0x7fffffff
 ; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    mov w8, #1325400063 // =0x4effffff
+; CHECK-NEXT:    mov z6.s, #0x7fffffff
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    fcvtzs z5.s, p0/m, z1.s
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z5.s, w9
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z5.s
-; CHECK-NEXT:    mov z4.s, p1/m, z3.s
-; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzs z2.s, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z4.s
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z1.s, z4.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    movprfx z2, z1
-; CHECK-NEXT:    fcvtzs z2.s, p0/m, z1.s
-; CHECK-NEXT:    sel z3.s, p1, z3.s, z2.s
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z5.s
-; CHECK-NEXT:    sel z2.s, p2, z6.s, z4.s
-; CHECK-NEXT:    mov z3.s, p1/m, z6.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    mov z2.s, p1/m, z3.s
 ; CHECK-NEXT:    fcmuo p1.s, p0/z, z0.s, z0.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z1.s, z1.s
-; CHECK-NEXT:    mov z2.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    mov z1.d, z3.d
+; CHECK-NEXT:    sel z3.s, p2, z3.s, z5.s
+; CHECK-NEXT:    sel z0.s, p3, z6.s, z2.s
+; CHECK-NEXT:    sel z1.s, p4, z6.s, z3.s
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8f32.nxv8i32(<vscale x 8 x float> %f)
     ret <vscale x 8 x i32> %x
@@ -99,23 +103,22 @@ define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
 define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-956301312
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #-956301312 // =0xc7000000
 ; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    mov w8, #65024
+; CHECK-NEXT:    mov w8, #65024 // =0xfe00
 ; CHECK-NEXT:    movk w8, #18175, lsl #16
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.s, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z2.s
 ; CHECK-NEXT:    mov z2.s, #32767 // =0x7fff
-; CHECK-NEXT:    mov z1.s, p1/m, z2.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.s, p1/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    sel z0.s, p2, z2.s, z1.s
+; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f32.nxv4i16(<vscale x 4 x float> %f)
     ret <vscale x 4 x i16> %x
@@ -124,32 +127,40 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
 define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
 ; CHECK-LABEL: test_signed_v8f32_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-956301312
-; CHECK-NEXT:    mov w9, #65024
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    movk w9, #18175, lsl #16
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.s, p0/m, z1.s
-; CHECK-NEXT:    mov z3.s, #32767 // =0x7fff
+; CHECK-NEXT:    mov w8, #-956301312 // =0xc7000000
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzs z4.s, p0/m, z0.s
 ; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    mov w8, #65024 // =0xfe00
+; CHECK-NEXT:    mov z5.s, #32767 // =0x7fff
+; CHECK-NEXT:    movk w8, #18175, lsl #16
+; CHECK-NEXT:    mov z3.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
-; CHECK-NEXT:    mov z5.s, w9
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z1.s, z5.s
-; CHECK-NEXT:    mov z4.s, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzs z2.s, p0/m, z0.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fcvtzs z2.s, p0/m, z1.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z3.s
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z3.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
 ; CHECK-NEXT:    mov z2.s, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z5.s
-; CHECK-NEXT:    mov z4.s, p2/m, z3.s
-; CHECK-NEXT:    mov z2.s, p1/m, z3.s
 ; CHECK-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z4.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    uzp1 z0.h, z2.h, z4.h
+; CHECK-NEXT:    mov z4.s, p2/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    sel z0.s, p3, z5.s, z2.s
+; CHECK-NEXT:    sel z1.s, p4, z5.s, z4.s
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8f32.nxv8i16(<vscale x 8 x float> %f)
     ret <vscale x 8 x i16> %x
@@ -158,23 +169,22 @@ define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
 define <vscale x 2 x i64> @test_signed_v2f32_v2i64(<vscale x 2 x float> %f) {
 ; CHECK-LABEL: test_signed_v2f32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-553648128
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
 ; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    mov w8, #1593835519
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    mov z3.s, w8
 ; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.s
+; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
 ; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
-; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2f32.nxv2i64(<vscale x 2 x float> %f)
     ret <vscale x 2 x i64> %x
@@ -183,33 +193,41 @@ define <vscale x 2 x i64> @test_signed_v2f32_v2i64(<vscale x 2 x float> %f) {
 define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-553648128
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z3.d, z0.s
-; CHECK-NEXT:    mov w9, #1593835519
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    uunpkhi z5.d, z0.s
-; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z3.s
-; CHECK-NEXT:    fcmge p1.s, p0/z, z3.s, z1.s
-; CHECK-NEXT:    mov z4.s, w9
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z3.s, z4.s
-; CHECK-NEXT:    mov z0.d, p1/m, z2.d
-; CHECK-NEXT:    fcmge p1.s, p0/z, z5.s, z1.s
-; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z5.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov w8, #-553648128 // =0xdf000000
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    mov w8, #1593835519 // =0x5effffff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.s
 ; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z5.s, z4.s
-; CHECK-NEXT:    mov z0.d, p2/m, z6.d
-; CHECK-NEXT:    mov z1.d, p1/m, z6.d
-; CHECK-NEXT:    fcmuo p1.s, p0/z, z3.s, z3.s
-; CHECK-NEXT:    fcmuo p0.s, p0/z, z5.s, z5.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.s
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z4.s
+; CHECK-NEXT:    fcmgt p4.s, p0/z, z0.s, z4.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    mov z2.d, p1/m, z3.d
+; CHECK-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
+; CHECK-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    sel z3.d, p2, z3.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z2.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z3.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f32.nxv4i64(<vscale x 4 x float> %f)
     ret <vscale x 4 x i64> %x
@@ -228,24 +246,23 @@ declare <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f64.nxv4i64(<vscale x 4 x double
 define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
 ; CHECK-LABEL: test_signed_v2f64_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-4476578029606273024
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4476578029606273024 // =0xc1e0000000000000
 ; CHECK-NEXT:    mov z2.d, #0xffffffff80000000
 ; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    mov x8, #281474972516352
+; CHECK-NEXT:    mov x8, #281474972516352 // =0xffffffc00000
 ; CHECK-NEXT:    movk x8, #16863, lsl #48
+; CHECK-NEXT:    mov z3.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
+; CHECK-NEXT:    mov z3.d, #0x7fffffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    mov z2.d, #0x7fffffff
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2f64.nxv2i32(<vscale x 2 x double> %f)
     ret <vscale x 2 x i32> %x
@@ -254,33 +271,41 @@ define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
 define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-4476578029606273024
-; CHECK-NEXT:    mov x9, #281474972516352
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    movk x9, #16863, lsl #48
+; CHECK-NEXT:    mov x8, #-4476578029606273024 // =0xc1e0000000000000
 ; CHECK-NEXT:    mov z3.d, #0xffffffff80000000
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.d
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    mov x8, #281474972516352 // =0xffffffc00000
 ; CHECK-NEXT:    mov z6.d, #0x7fffffff
+; CHECK-NEXT:    movk x8, #16863, lsl #48
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.d
+; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    mov z5.d, x9
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z1.d, z5.d
-; CHECK-NEXT:    mov z4.d, p1/m, z3.d
-; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z0.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
 ; CHECK-NEXT:    mov z2.d, p1/m, z3.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z5.d
-; CHECK-NEXT:    sel z3.d, p2, z6.d, z4.d
-; CHECK-NEXT:    mov z2.d, p1/m, z6.d
 ; CHECK-NEXT:    fcmuo p1.d, p0/z, z1.d, z1.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    uzp1 z0.s, z2.s, z3.s
+; CHECK-NEXT:    sel z3.d, p2, z3.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z2.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z3.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4f64.nxv4i32(<vscale x 4 x double> %f)
     ret <vscale x 4 x i32> %x
@@ -289,52 +314,66 @@ define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
 define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 ; CHECK-LABEL: test_signed_v8f64_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-4476578029606273024
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z5.d, #0xffffffff80000000
+; CHECK-NEXT:    mov x8, #-4476578029606273024 // =0xc1e0000000000000
 ; CHECK-NEXT:    movprfx z6, z1
 ; CHECK-NEXT:    fcvtzs z6.d, p0/m, z1.d
-; CHECK-NEXT:    mov z24.d, #0x7fffffff
 ; CHECK-NEXT:    mov z4.d, x8
-; CHECK-NEXT:    mov x8, #281474972516352
+; CHECK-NEXT:    mov x8, #281474972516352 // =0xffffffc00000
+; CHECK-NEXT:    movprfx z7, z0
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z0.d
 ; CHECK-NEXT:    movk x8, #16863, lsl #48
+; CHECK-NEXT:    mov z26.d, #0x7fffffff
+; CHECK-NEXT:    movprfx z24, z3
+; CHECK-NEXT:    fcvtzs z24.d, p0/m, z3.d
+; CHECK-NEXT:    mov z5.d, x8
+; CHECK-NEXT:    movprfx z25, z2
+; CHECK-NEXT:    fcvtzs z25.d, p0/m, z2.d
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z4.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    mov z6.d, p1/m, z5.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z7.d, x8
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z7.d
-; CHECK-NEXT:    mov z6.d, p1/m, z24.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z1.d, z1.d
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
-; CHECK-NEXT:    mov z6.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p2/m, z5.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z7.d
-; CHECK-NEXT:    mov z1.d, p2/m, z24.d
-; CHECK-NEXT:    fcmge p2.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z3.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p2/m, z5.d
-; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    movprfx z4, z2
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z2.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z3.d, z4.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z2.d, z4.d
+; CHECK-NEXT:    mov z4.d, #0xffffffff80000000
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z1.d, z5.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z0.d, z5.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z3.d, z5.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z7.d
-; CHECK-NEXT:    mov z4.d, p2/m, z5.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z2.d, z7.d
-; CHECK-NEXT:    sel z5.d, p1, z24.d, z0.d
-; CHECK-NEXT:    mov z4.d, p2/m, z24.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z6.d, p1/m, z4.d
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z2.d, z5.d
+; CHECK-NEXT:    sel z5.d, p2, z4.d, z7.d
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    sel z7.d, p3, z4.d, z24.d
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    sel z4.d, p4, z4.d, z25.d
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z3.d, z3.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z2.d, z2.d
-; CHECK-NEXT:    mov z5.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z4.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    uzp1 z0.s, z1.s, z6.s
-; CHECK-NEXT:    uzp1 z1.s, z4.s, z5.s
+; CHECK-NEXT:    sel z0.d, p5, z26.d, z6.d
+; CHECK-NEXT:    sel z1.d, p6, z26.d, z5.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z2.d, p7, z26.d, z7.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z26.d, z4.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    uzp1 z1.s, z3.s, z2.s
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8f64.nxv8i32(<vscale x 8 x double> %f)
     ret <vscale x 8 x i32> %x
@@ -343,32 +382,40 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-4548635623644200960
-; CHECK-NEXT:    mov x9, #281200098803712
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    movk x9, #16607, lsl #48
-; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z1.d
-; CHECK-NEXT:    mov z3.d, #32767 // =0x7fff
+; CHECK-NEXT:    mov x8, #-4548635623644200960 // =0xc0e0000000000000
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    mov x8, #281200098803712 // =0xffc000000000
+; CHECK-NEXT:    mov z5.d, #32767 // =0x7fff
+; CHECK-NEXT:    movk x8, #16607, lsl #48
+; CHECK-NEXT:    mov z3.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    mov z5.d, x9
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z1.d, z5.d
-; CHECK-NEXT:    mov z4.d, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z0.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z3.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z0.d, z3.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
 ; CHECK-NEXT:    mov z2.d, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z5.d
-; CHECK-NEXT:    mov z4.d, p2/m, z3.d
-; CHECK-NEXT:    mov z2.d, p1/m, z3.d
 ; CHECK-NEXT:    fcmuo p1.d, p0/z, z1.d, z1.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z4.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    uzp1 z0.s, z2.s, z4.s
+; CHECK-NEXT:    mov z4.d, p2/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    sel z0.d, p3, z5.d, z2.d
+; CHECK-NEXT:    sel z1.d, p4, z5.d, z4.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f64.nxv4i16(<vscale x 4 x double> %f)
     ret <vscale x 4 x i16> %x
@@ -377,52 +424,66 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK-LABEL: test_signed_v8f64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-4548635623644200960
-; CHECK-NEXT:    mov x9, #281200098803712
-; CHECK-NEXT:    movk x9, #16607, lsl #48
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4548635623644200960 // =0xc0e0000000000000
 ; CHECK-NEXT:    movprfx z5, z3
 ; CHECK-NEXT:    fcvtzs z5.d, p0/m, z3.d
-; CHECK-NEXT:    mov z7.d, #32767 // =0x7fff
 ; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    mov x8, #281200098803712 // =0xffc000000000
+; CHECK-NEXT:    movprfx z6, z2
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z2.d
+; CHECK-NEXT:    movk x8, #16607, lsl #48
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    fcvtzs z7.d, p0/m, z1.d
+; CHECK-NEXT:    movprfx z24, z0
+; CHECK-NEXT:    fcvtzs z24.d, p0/m, z0.d
+; CHECK-NEXT:    mov z25.d, #32767 // =0x7fff
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    mov z6.d, x9
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z3.d, z6.d
-; CHECK-NEXT:    mov z5.d, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z3.d, z3.d
-; CHECK-NEXT:    mov z5.d, p2/m, z7.d
 ; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    movprfx z3, z2
-; CHECK-NEXT:    fcvtzs z3.d, p0/m, z2.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z3.d, p2/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z2.d, z6.d
-; CHECK-NEXT:    mov z3.d, p2/m, z7.d
-; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z4.d
-; CHECK-NEXT:    mov z5.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z2.d, z2.d
-; CHECK-NEXT:    movprfx z2, z1
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z2.d, p2/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
+; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z3.d, z4.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z2.d, z4.d
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z6.d
-; CHECK-NEXT:    mov z4.d, p2/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z6.d
-; CHECK-NEXT:    mov z2.d, p1/m, z7.d
-; CHECK-NEXT:    mov z4.d, p2/m, z7.d
-; CHECK-NEXT:    fcmuo p1.d, p0/z, z1.d, z1.d
+; CHECK-NEXT:    fcmgt p7.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z5.d, p1/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    mov z6.d, p2/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcmuo p2.d, p0/z, z3.d, z3.d
+; CHECK-NEXT:    mov z7.d, p3/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcmuo p3.d, p0/z, z2.d, z2.d
+; CHECK-NEXT:    mov z24.d, p4/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    fcmuo p4.d, p0/z, z1.d, z1.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z4.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    uzp1 z0.s, z3.s, z5.s
-; CHECK-NEXT:    uzp1 z1.s, z4.s, z2.s
+; CHECK-NEXT:    sel z2.d, p5, z25.d, z5.d
+; CHECK-NEXT:    sel z0.d, p6, z25.d, z6.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z1.d, p7, z25.d, z7.d
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z25.d, z24.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z2.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z3.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8f64.nxv8i16(<vscale x 8 x double> %f)
     ret <vscale x 8 x i16> %x
@@ -431,23 +492,22 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 define <vscale x 2 x i64> @test_signed_v2f64_v2i64(<vscale x 2 x double> %f) {
 ; CHECK-LABEL: test_signed_v2f64_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-4332462841530417152
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
 ; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    mov x8, #4890909195324358655
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    mov z3.d, x8
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z1.d
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.d
+; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z3.d
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z0.d, z0.d
-; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2f64.nxv2i64(<vscale x 2 x double> %f)
     ret <vscale x 2 x i64> %x
@@ -456,33 +516,39 @@ define <vscale x 2 x i64> @test_signed_v2f64_v2i64(<vscale x 2 x double> %f) {
 define <vscale x 4 x i64> @test_signed_v4f64_v4i64(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-4332462841530417152
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x9, #4890909195324358655
+; CHECK-NEXT:    mov x8, #-4332462841530417152 // =0xc3e0000000000000
 ; CHECK-NEXT:    mov z3.d, #0x8000000000000000
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z0.d
-; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    mov x8, #4890909195324358655 // =0x43dfffffffffffff
+; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z1.d
 ; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    mov z5.d, x9
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z5.d
-; CHECK-NEXT:    mov z4.d, p1/m, z3.d
-; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z0.d
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    fcmgt p4.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    movprfx z2, z1
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.d
-; CHECK-NEXT:    sel z3.d, p1, z3.d, z2.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z5.d
-; CHECK-NEXT:    sel z2.d, p2, z6.d, z4.d
-; CHECK-NEXT:    mov z3.d, p1/m, z6.d
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    mov z2.d, p1/m, z3.d
 ; CHECK-NEXT:    fcmuo p1.d, p0/z, z0.d, z0.d
 ; CHECK-NEXT:    fcmuo p0.d, p0/z, z1.d, z1.d
-; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    mov z1.d, z3.d
+; CHECK-NEXT:    sel z3.d, p2, z3.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z2.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z3.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f64.nxv4i64(<vscale x 4 x double> %f)
     ret <vscale x 4 x i64> %x
@@ -502,23 +568,22 @@ declare <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f16.nxv4i64(<vscale x 4 x half>)
 define <vscale x 2 x i32> @test_signed_v2f16_v2i32(<vscale x 2 x half> %f) {
 ; CHECK-LABEL: test_signed_v2f16_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    mov z2.d, #0xffffffff80000000
 ; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    mov w8, #31743
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z3.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    mov z3.d, #0x7fffffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.d, #0x7fffffff
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptosi.sat.nxv2f16.nxv2i32(<vscale x 2 x half> %f)
     ret <vscale x 2 x i32> %x
@@ -527,23 +592,22 @@ define <vscale x 2 x i32> @test_signed_v2f16_v2i32(<vscale x 2 x half> %f) {
 define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    mov z2.s, #0x80000000
 ; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    mov w8, #31743
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z3.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z0.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    mov z3.s, #0x7fffffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.s, p1/m, z2.s
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.s, #0x7fffffff
-; CHECK-NEXT:    mov z1.s, p1/m, z2.s
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.s, p1/m, z2.s
+; CHECK-NEXT:    sel z0.s, p2, z3.s, z1.s
+; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i32> @llvm.fptosi.sat.nxv4f16.nxv4i32(<vscale x 4 x half> %f)
     ret <vscale x 4 x i32> %x
@@ -552,33 +616,41 @@ define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 ; CHECK-LABEL: test_signed_v8f16_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    mov w9, #31743
-; CHECK-NEXT:    mov z2.s, #0x80000000
-; CHECK-NEXT:    uunpkhi z5.s, z0.h
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z3.h
-; CHECK-NEXT:    fcmge p1.h, p0/z, z3.h, z1.h
-; CHECK-NEXT:    mov z4.h, w9
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z3.h, z4.h
-; CHECK-NEXT:    mov z0.s, p1/m, z2.s
-; CHECK-NEXT:    fcmge p1.h, p0/z, z5.h, z1.h
-; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fcvtzs z1.s, p0/m, z5.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z3.s, #0x80000000
+; CHECK-NEXT:    mov z4.h, w8
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.s, p0/m, z0.h
 ; CHECK-NEXT:    mov z6.s, #0x7fffffff
-; CHECK-NEXT:    mov z1.s, p1/m, z2.s
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z5.h, z4.h
-; CHECK-NEXT:    mov z0.s, p2/m, z6.s
-; CHECK-NEXT:    mov z1.s, p1/m, z6.s
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
-; CHECK-NEXT:    fcmuo p0.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fcvtzs z2.s, p0/m, z1.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z4.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z4.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    mov z2.s, p1/m, z3.s
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    sel z3.s, p2, z3.s, z5.s
+; CHECK-NEXT:    sel z0.s, p3, z6.s, z2.s
+; CHECK-NEXT:    sel z1.s, p4, z6.s, z3.s
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i32> @llvm.fptosi.sat.nxv8f16.nxv8i32(<vscale x 8 x half> %f)
     ret <vscale x 8 x i32> %x
@@ -587,22 +659,21 @@ define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #63488
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #63488 // =0xf800
 ; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    mov w8, #30719
+; CHECK-NEXT:    mov w8, #30719 // =0x77ff
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    mov z1.s, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
 ; CHECK-NEXT:    mov z2.s, #32767 // =0x7fff
-; CHECK-NEXT:    mov z1.s, p1/m, z2.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.s, p1/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    sel z0.s, p2, z2.s, z1.s
+; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptosi.sat.nxv4f16.nxv4i16(<vscale x 4 x half> %f)
     ret <vscale x 4 x i16> %x
@@ -611,22 +682,21 @@ define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 ; CHECK-LABEL: test_signed_v8f16_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #63488
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #63488 // =0xf800
 ; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    mov w8, #30719
+; CHECK-NEXT:    mov w8, #30719 // =0x77ff
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.h, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    mov z1.h, p1/m, #-32768 // =0xffffffffffff8000
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z2.h
 ; CHECK-NEXT:    mov z2.h, #32767 // =0x7fff
-; CHECK-NEXT:    mov z1.h, p1/m, z2.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.h, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.h, p1/m, #-32768 // =0xffffffffffff8000
+; CHECK-NEXT:    sel z0.h, p2, z2.h, z1.h
+; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptosi.sat.nxv8f16.nxv8i16(<vscale x 8 x half> %f)
     ret <vscale x 8 x i16> %x
@@ -635,23 +705,22 @@ define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 ; CHECK-LABEL: test_signed_v2f16_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
 ; CHECK-NEXT:    mov z2.d, #0x8000000000000000
 ; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    mov w8, #31743
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z3.h, w8
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z0.h
+; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    mov z3.d, #0x7fffffffffffffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
-; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    mov z1.d, p1/m, z2.d
+; CHECK-NEXT:    sel z0.d, p2, z3.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i64> @llvm.fptosi.sat.nxv2f16.nxv2i64(<vscale x 2 x half> %f)
     ret <vscale x 2 x i64> %x
@@ -660,33 +729,41 @@ define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64511
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z3.d, z0.s
-; CHECK-NEXT:    mov w9, #31743
-; CHECK-NEXT:    mov z2.d, #0x8000000000000000
-; CHECK-NEXT:    uunpkhi z5.d, z0.s
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z3.h
-; CHECK-NEXT:    fcmge p1.h, p0/z, z3.h, z1.h
-; CHECK-NEXT:    mov z4.h, w9
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z3.h, z4.h
-; CHECK-NEXT:    mov z0.d, p1/m, z2.d
-; CHECK-NEXT:    fcmge p1.h, p0/z, z5.h, z1.h
-; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z5.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    mov w8, #64511 // =0xfbff
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    mov z2.h, w8
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    mov z3.d, #0x8000000000000000
+; CHECK-NEXT:    mov z4.h, w8
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    fcvtzs z5.d, p0/m, z0.h
 ; CHECK-NEXT:    mov z6.d, #0x7fffffffffffffff
-; CHECK-NEXT:    mov z1.d, p1/m, z2.d
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z5.h, z4.h
-; CHECK-NEXT:    mov z0.d, p2/m, z6.d
-; CHECK-NEXT:    mov z1.d, p1/m, z6.d
-; CHECK-NEXT:    fcmuo p1.h, p0/z, z3.h, z3.h
-; CHECK-NEXT:    fcmuo p0.h, p0/z, z5.h, z5.h
+; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    fcmge p2.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z1.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z1.h, z4.h
+; CHECK-NEXT:    fcmgt p4.h, p0/z, z0.h, z4.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    mov z2.d, p1/m, z3.d
+; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z1.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z0.h, z0.h
+; CHECK-NEXT:    sel z3.d, p2, z3.d, z5.d
+; CHECK-NEXT:    sel z0.d, p3, z6.d, z2.d
+; CHECK-NEXT:    sel z1.d, p4, z6.d, z3.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptosi.sat.nxv4f16.nxv4i64(<vscale x 4 x half> %f)
     ret <vscale x 4 x i64> %x

diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
index 9e847c178b3e2c..c9e06fd9f3414e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll
@@ -14,16 +14,17 @@ declare <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f32.nxv4i64(<vscale x 4 x float>
 define <vscale x 2 x i32> @test_signed_v2f32_v2i32(<vscale x 2 x float> %f) {
 ; CHECK-LABEL: test_signed_v2f32_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1333788671
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.s
 ; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.d, #0xffffffff
-; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p2/m, z1.d
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z0.d, #0xffffffff
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2f32.nxv2i32(<vscale x 2 x float> %f)
     ret <vscale x 2 x i32> %x
@@ -32,15 +33,15 @@ define <vscale x 2 x i32> @test_signed_v2f32_v2i32(<vscale x 2 x float> %f) {
 define <vscale x 4 x i32> @test_signed_v4f32_v4i32(<vscale x 4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1333788671
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -51,22 +52,22 @@ define <vscale x 4 x i32> @test_signed_v4f32_v4i32(<vscale x 4 x float> %f) {
 define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
 ; CHECK-LABEL: test_signed_v8f32_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1333788671
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, #0.0
+; CHECK-NEXT:    mov w8, #1333788671 // =0x4f7fffff
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    mov z4.s, w8
 ; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    fcvtzu z3.s, p0/m, z1.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, #0.0
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z0.s, z4.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z2.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z0.s, z4.s
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z1.s, z4.s
+; CHECK-NEXT:    mov z2.s, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.s, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.s, p1/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z2.s, p3/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z3.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    mov z1.d, z3.d
@@ -78,17 +79,18 @@ define <vscale x 8 x i32> @test_signed_v8f32_v8i32(<vscale x 8 x float> %f) {
 define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #65280
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #65280 // =0xff00
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.s
 ; CHECK-NEXT:    movk w8, #18303, lsl #16
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z0.s, z1.s
-; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.s, #65535 // =0xffff
-; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.s, p2/m, z1.s
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    mov z0.s, #65535 // =0xffff
+; CHECK-NEXT:    mov z2.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4f32.nxv4i16(<vscale x 4 x float> %f)
     ret <vscale x 4 x i16> %x
@@ -97,26 +99,26 @@ define <vscale x 4 x i16> @test_signed_v4f32_v4i16(<vscale x 4 x float> %f) {
 define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
 ; CHECK-LABEL: test_signed_v8f32_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #65280
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    movk w8, #18303, lsl #16
-; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, #0.0
+; CHECK-NEXT:    mov w8, #65280 // =0xff00
 ; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    fcvtzu z3.s, p0/m, z1.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z3.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    movk w8, #18303, lsl #16
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzu z4.s, p0/m, z0.s
 ; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z1.s, #0.0
+; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.s, p0/z, z1.s, z2.s
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.s, p0/m, z0.s
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
 ; CHECK-NEXT:    mov z0.s, #65535 // =0xffff
-; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    sel z2.s, p2, z0.s, z3.s
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    mov z3.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.s, p2/m, #0 // =0x0
+; CHECK-NEXT:    sel z1.s, p3, z0.s, z3.s
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z4.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8f32.nxv8i16(<vscale x 8 x float> %f)
     ret <vscale x 8 x i16> %x
@@ -125,15 +127,15 @@ define <vscale x 8 x i16> @test_signed_v8f32_v8i16(<vscale x 8 x float> %f) {
 define <vscale x 2 x i64> @test_signed_v2f32_v2i64(<vscale x 2 x float> %f) {
 ; CHECK-LABEL: test_signed_v2f32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1602224127
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    mov w8, #1602224127 // =0x5f7fffff
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -144,24 +146,24 @@ define <vscale x 2 x i64> @test_signed_v2f32_v2i64(<vscale x 2 x float> %f) {
 define <vscale x 4 x i64> @test_signed_v4f32_v4i64(<vscale x 4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1602224127
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z3.d, z0.s
-; CHECK-NEXT:    fcmge p2.s, p0/z, z1.s, #0.0
-; CHECK-NEXT:    movprfx z0, z1
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.s
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p2.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    fcmgt p1.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    mov w8, #1602224127 // =0x5f7fffff
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.s
 ; CHECK-NEXT:    movprfx z1, z3
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z3.s
+; CHECK-NEXT:    mov z4.s, w8
+; CHECK-NEXT:    fcmge p1.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    fcmge p2.s, p0/z, z3.s, #0.0
+; CHECK-NEXT:    fcmgt p3.s, p0/z, z2.s, z4.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    fcmgt p0.s, p0/z, z3.s, z2.s
+; CHECK-NEXT:    fcmgt p0.s, p0/z, z3.s, z4.s
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p1/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z0.d, p3/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f32.nxv4i64(<vscale x 4 x float> %f)
@@ -181,17 +183,18 @@ declare <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f64.nxv4i64(<vscale x 4 x double
 define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
 ; CHECK-LABEL: test_signed_v2f64_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #281474974613504
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.d
 ; CHECK-NEXT:    movk x8, #16879, lsl #48
-; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
 ; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z0.d, z1.d
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.d, #0xffffffff
-; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p2/m, z1.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, #0xffffffff
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2f64.nxv2i32(<vscale x 2 x double> %f)
     ret <vscale x 2 x i32> %x
@@ -200,26 +203,26 @@ define <vscale x 2 x i32> @test_signed_v2f64_v2i32(<vscale x 2 x double> %f) {
 define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #281474974613504
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    movk x8, #16879, lsl #48
-; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
 ; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    movk x8, #16879, lsl #48
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzu z4.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.d
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z2.d
 ; CHECK-NEXT:    mov z0.d, #0xffffffff
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    sel z2.d, p2, z0.d, z3.d
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    sel z1.d, p3, z0.d, z3.d
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z4.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i32> @llvm.fptoui.sat.nxv4f64.nxv4i32(<vscale x 4 x double> %f)
     ret <vscale x 4 x i32> %x
@@ -228,40 +231,53 @@ define <vscale x 4 x i32> @test_signed_v4f64_v4i32(<vscale x 4 x double> %f) {
 define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 ; CHECK-LABEL: test_signed_v8f64_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #281474974613504
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    movk x8, #16879, lsl #48
-; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    mov x8, #281474974613504 // =0xffffffe00000
 ; CHECK-NEXT:    movprfx z5, z1
 ; CHECK-NEXT:    fcvtzu z5.d, p0/m, z1.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z5.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    movk x8, #16879, lsl #48
 ; CHECK-NEXT:    movprfx z6, z0
 ; CHECK-NEXT:    fcvtzu z6.d, p0/m, z0.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z1.d, z4.d
-; CHECK-NEXT:    mov z1.d, #0xffffffff
+; CHECK-NEXT:    movprfx z7, z3
+; CHECK-NEXT:    fcvtzu z7.d, p0/m, z3.d
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    movprfx z24, z2
+; CHECK-NEXT:    fcvtzu z24.d, p0/m, z2.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    fcmge p3.d, p0/z, z3.d, #0.0
+; CHECK-NEXT:    fcmge p4.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    mov z0.d, #0xffffffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z6.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    sel z0.d, p2, z1.d, z5.d
-; CHECK-NEXT:    fcmge p2.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    sel z5.d, p1, z1.d, z6.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z3.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    movprfx z6, z2
-; CHECK-NEXT:    fcvtzu z6.d, p0/m, z2.d
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z5.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z3.d, z4.d
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z2.d, z4.d
 ; CHECK-NEXT:    mov z6.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    sel z2.d, p1, z1.d, z3.d
-; CHECK-NEXT:    sel z1.d, p0, z1.d, z6.d
-; CHECK-NEXT:    uzp1 z0.s, z5.s, z0.s
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z2.s
+; CHECK-NEXT:    mov z7.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z24.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    sel z1.d, p5, z0.d, z5.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z2.d, p6, z0.d, z6.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z0.d, z7.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z4.d, p0, z0.d, z24.d
+; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
+; CHECK-NEXT:    uzp1 z1.s, z4.s, z3.s
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8f64.nxv8i32(<vscale x 8 x double> %f)
     ret <vscale x 8 x i32> %x
@@ -270,26 +286,26 @@ define <vscale x 8 x i32> @test_signed_v8f64_v8i32(<vscale x 8 x double> %f) {
 define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #281337537757184
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    movk x8, #16623, lsl #48
-; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    mov x8, #281337537757184 // =0xffe000000000
 ; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    movk x8, #16623, lsl #48
+; CHECK-NEXT:    movprfx z4, z0
+; CHECK-NEXT:    fcvtzu z4.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    fcmge p1.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.d
+; CHECK-NEXT:    not p2.b, p0/z, p2.b
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z2.d
 ; CHECK-NEXT:    mov z0.d, #65535 // =0xffff
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    sel z2.d, p2, z0.d, z3.d
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    sel z1.d, p3, z0.d, z3.d
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z4.d
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4f64.nxv4i16(<vscale x 4 x double> %f)
     ret <vscale x 4 x i16> %x
@@ -298,41 +314,54 @@ define <vscale x 4 x i16> @test_signed_v4f64_v4i16(<vscale x 4 x double> %f) {
 define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 ; CHECK-LABEL: test_signed_v8f64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #281337537757184
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    movk x8, #16623, lsl #48
-; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, #0.0
+; CHECK-NEXT:    mov x8, #281337537757184 // =0xffe000000000
 ; CHECK-NEXT:    movprfx z5, z3
 ; CHECK-NEXT:    fcvtzu z5.d, p0/m, z3.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z5.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p1.d, p0/z, z2.d, #0.0
-; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    movk x8, #16623, lsl #48
 ; CHECK-NEXT:    movprfx z6, z2
 ; CHECK-NEXT:    fcvtzu z6.d, p0/m, z2.d
-; CHECK-NEXT:    fcmgt p2.d, p0/z, z3.d, z4.d
-; CHECK-NEXT:    mov z3.d, #65535 // =0xffff
+; CHECK-NEXT:    movprfx z7, z1
+; CHECK-NEXT:    fcvtzu z7.d, p0/m, z1.d
+; CHECK-NEXT:    mov z4.d, x8
+; CHECK-NEXT:    movprfx z24, z0
+; CHECK-NEXT:    fcvtzu z24.d, p0/m, z0.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z3.d, #0.0
+; CHECK-NEXT:    fcmge p2.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    fcmge p3.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    fcmge p4.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    fcmgt p5.d, p0/z, z3.d, z4.d
+; CHECK-NEXT:    fcmgt p6.d, p0/z, z2.d, z4.d
+; CHECK-NEXT:    mov z2.d, #65535 // =0xffff
 ; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z6.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z2.d, z4.d
-; CHECK-NEXT:    sel z2.d, p2, z3.d, z5.d
-; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, #0.0
-; CHECK-NEXT:    sel z5.d, p1, z3.d, z6.d
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z4.d
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p2.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    movprfx z6, z0
-; CHECK-NEXT:    fcvtzu z6.d, p0/m, z0.d
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
+; CHECK-NEXT:    not p3.b, p0/z, p3.b
+; CHECK-NEXT:    not p4.b, p0/z, p4.b
+; CHECK-NEXT:    mov z5.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmgt p1.d, p0/z, z1.d, z4.d
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    mov z6.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    sel z0.d, p1, z3.d, z1.d
-; CHECK-NEXT:    sel z1.d, p0, z3.d, z6.d
-; CHECK-NEXT:    uzp1 z2.s, z5.s, z2.s
+; CHECK-NEXT:    mov z7.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z24.d, p4/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p5, z2.d, z5.d
+; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z1.d, p6, z2.d, z6.d
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z3.d, p1, z2.d, z7.d
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    sel z2.d, p0, z2.d, z24.d
 ; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z1.s, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i16> @llvm.fptoui.sat.nxv8f64.nxv8i16(<vscale x 8 x double> %f)
     ret <vscale x 8 x i16> %x
@@ -341,15 +370,15 @@ define <vscale x 8 x i16> @test_signed_v8f64_v8i16(<vscale x 8 x double> %f) {
 define <vscale x 2 x i64> @test_signed_v2f64_v2i64(<vscale x 2 x double> %f) {
 ; CHECK-LABEL: test_signed_v2f64_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4895412794951729151
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    mov x8, #4895412794951729151 // =0x43efffffffffffff
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -360,22 +389,22 @@ define <vscale x 2 x i64> @test_signed_v2f64_v2i64(<vscale x 2 x double> %f) {
 define <vscale x 4 x i64> @test_signed_v4f64_v4i64(<vscale x 4 x double> %f) {
 ; CHECK-LABEL: test_signed_v4f64_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4895412794951729151
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
-; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    mov x8, #4895412794951729151 // =0x43efffffffffffff
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    mov z4.d, x8
 ; CHECK-NEXT:    movprfx z3, z1
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z1.d
+; CHECK-NEXT:    fcmge p1.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    fcmge p2.d, p0/z, z1.d, #0.0
+; CHECK-NEXT:    fcmgt p3.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    fcmgt p1.d, p0/z, z0.d, z4.d
 ; CHECK-NEXT:    fcmgt p0.d, p0/z, z1.d, z4.d
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z3.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p1/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z2.d, p3/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z3.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z2.d
 ; CHECK-NEXT:    mov z1.d, z3.d
@@ -398,16 +427,17 @@ declare <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f16.nxv4i64(<vscale x 4 x half>)
 define <vscale x 2 x i32> @test_signed_v2f16_v2i32(<vscale x 2 x half> %f) {
 ; CHECK-LABEL: test_signed_v2f16_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31743
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z0.h
 ; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.h
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.d, #0xffffffff
-; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p2/m, z1.d
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.d, #0xffffffff
+; CHECK-NEXT:    mov z2.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
 ; CHECK-NEXT:    ret
     %x = call <vscale x 2 x i32> @llvm.fptoui.sat.nxv2f16.nxv2i32(<vscale x 2 x half> %f)
     ret <vscale x 2 x i32> %x
@@ -416,15 +446,15 @@ define <vscale x 2 x i32> @test_signed_v2f16_v2i32(<vscale x 2 x half> %f) {
 define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31743
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -435,24 +465,24 @@ define <vscale x 4 x i32> @test_signed_v4f16_v4i32(<vscale x 4 x half> %f) {
 define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 ; CHECK-LABEL: test_signed_v8f16_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31743
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    uunpkhi z3.s, z0.h
-; CHECK-NEXT:    fcmge p2.h, p0/z, z1.h, #0.0
-; CHECK-NEXT:    movprfx z0, z1
-; CHECK-NEXT:    fcvtzu z0.s, p0/m, z1.h
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    mov z0.s, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z2.h
 ; CHECK-NEXT:    movprfx z1, z3
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z3.h
+; CHECK-NEXT:    mov z4.h, w8
+; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z2.h, z4.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z4.h
+; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.s, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.s, p1/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z0.s, p3/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.s, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ret
     %x = call <vscale x 8 x i32> @llvm.fptoui.sat.nxv8f16.nxv8i32(<vscale x 8 x half> %f)
@@ -462,16 +492,17 @@ define <vscale x 8 x i32> @test_signed_v8f16_v8i32(<vscale x 8 x half> %f) {
 define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31743
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    movprfx z2, z0
+; CHECK-NEXT:    fcvtzu z2.s, p0/m, z0.h
 ; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    mov z1.s, #65535 // =0xffff
-; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.s, p2/m, z1.s
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    mov z0.s, #65535 // =0xffff
+; CHECK-NEXT:    mov z2.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i16> @llvm.fptoui.sat.nxv4f16.nxv4i16(<vscale x 4 x half> %f)
     ret <vscale x 4 x i16> %x
@@ -480,15 +511,15 @@ define <vscale x 4 x i16> @test_signed_v4f16_v4i16(<vscale x 4 x half> %f) {
 define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 ; CHECK-LABEL: test_signed_v8f16_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31743
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzu z1.h, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    mov z1.h, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z1.h, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.h, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -499,15 +530,15 @@ define <vscale x 8 x i16> @test_signed_v8f16_v8i16(<vscale x 8 x half> %f) {
 define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 ; CHECK-LABEL: test_signed_v2f16_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31743
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
 ; CHECK-NEXT:    movprfx z1, z0
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -518,24 +549,24 @@ define <vscale x 2 x i64> @test_signed_v2f16_v2i64(<vscale x 2 x half> %f) {
 define <vscale x 4 x i64> @test_signed_v4f16_v4i64(<vscale x 4 x half> %f) {
 ; CHECK-LABEL: test_signed_v4f16_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31743
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    uunpkhi z3.d, z0.s
-; CHECK-NEXT:    fcmge p2.h, p0/z, z1.h, #0.0
-; CHECK-NEXT:    movprfx z0, z1
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.h
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    mov z0.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    mov w8, #31743 // =0x7bff
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.h
 ; CHECK-NEXT:    movprfx z1, z3
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z3.h
+; CHECK-NEXT:    mov z4.h, w8
+; CHECK-NEXT:    fcmge p1.h, p0/z, z2.h, #0.0
+; CHECK-NEXT:    fcmge p2.h, p0/z, z3.h, #0.0
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z2.h, z4.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
 ; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z4.h
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    mov z1.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p1/m, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z0.d, p3/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.d, p0/m, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ret
     %x = call <vscale x 4 x i64> @llvm.fptoui.sat.nxv4f16.nxv4i64(<vscale x 4 x half> %f)

diff --git a/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll b/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll
index 6f66d45ae313cf..d1ca2ba0f95815 100644
--- a/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll
@@ -69,14 +69,14 @@ define void @fptrunc8_f64_f16(<vscale x 8 x half> *%dst, <vscale x 8 x double> *
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1, #1, mul vl]
 ; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1, #2, mul vl]
 ; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x1, #3, mul vl]
+; CHECK-NEXT:    fcvt z2.h, p0/m, z2.d
 ; CHECK-NEXT:    fcvt z1.h, p0/m, z1.d
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    fcvt z3.h, p0/m, z3.d
-; CHECK-NEXT:    fcvt z2.h, p0/m, z2.d
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
index f96e411885df00..1a6e4814cee6d2 100644
--- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
@@ -6,10 +6,10 @@
 define void @scatter_i8_index_offset_maximum(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
 ; CHECK-LABEL: scatter_i8_index_offset_maximum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #33554431
-; CHECK-NEXT:    add x9, x0, x1
+; CHECK-NEXT:    mov w8, #33554431 // =0x1ffffff
 ; CHECK-NEXT:    index z1.s, #0, w8
-; CHECK-NEXT:    st1b { z0.s }, p0, [x9, z1.s, sxtw]
+; CHECK-NEXT:    add x8, x0, x1
+; CHECK-NEXT:    st1b { z0.s }, p0, [x8, z1.s, sxtw]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -27,10 +27,10 @@ define void @scatter_i8_index_offset_maximum(ptr %base, i64 %offset, <vscale x 4
 define void @scatter_i16_index_offset_minimum(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i16> %data) #0 {
 ; CHECK-LABEL: scatter_i16_index_offset_minimum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-33554432
-; CHECK-NEXT:    add x9, x0, x1, lsl #1
+; CHECK-NEXT:    mov w8, #-33554432 // =0xfe000000
 ; CHECK-NEXT:    index z1.s, #0, w8
-; CHECK-NEXT:    st1h { z0.s }, p0, [x9, z1.s, sxtw #1]
+; CHECK-NEXT:    add x8, x0, x1, lsl #1
+; CHECK-NEXT:    st1h { z0.s }, p0, [x8, z1.s, sxtw #1]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -48,8 +48,8 @@ define void @scatter_i16_index_offset_minimum(ptr %base, i64 %offset, <vscale x
 define <vscale x 4 x i8> @gather_i8_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg) #0 {
 ; CHECK-LABEL: gather_i8_index_offset_8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x1
 ; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    add x8, x0, x1
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x8, z0.s, sxtw]
 ; CHECK-NEXT:    ret
   %splat.insert0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
@@ -73,17 +73,17 @@ define void @scatter_f16_index_offset_var(ptr %base, i64 %offset, i64 %scale, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    index z1.d, #0, #1
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z2.d, z1.d
-; CHECK-NEXT:    mov z3.d, x1
-; CHECK-NEXT:    incd z2.d
-; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z3.d
-; CHECK-NEXT:    mad z2.d, p1/m, z3.d, z3.d
-; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    mov z2.d, x1
+; CHECK-NEXT:    movprfx z4, z2
+; CHECK-NEXT:    mla z4.d, p1/m, z1.d, z2.d
+; CHECK-NEXT:    punpklo p2.h, p0.b
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
-; CHECK-NEXT:    st1h { z3.d }, p1, [x0, z1.d, lsl #1]
-; CHECK-NEXT:    st1h { z0.d }, p0, [x0, z2.d, lsl #1]
+; CHECK-NEXT:    incd z1.d
+; CHECK-NEXT:    st1h { z3.d }, p2, [x0, z4.d, lsl #1]
+; CHECK-NEXT:    mad z1.d, p1/m, z2.d, z2.d
+; CHECK-NEXT:    st1h { z0.d }, p0, [x0, z1.d, lsl #1]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -101,18 +101,18 @@ define void @scatter_f16_index_offset_var(ptr %base, i64 %offset, i64 %scale, <v
 define void @scatter_i8_index_offset_maximum_plus_one(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
 ; CHECK-LABEL: scatter_i8_index_offset_maximum_plus_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    mov w9, #67108864
-; CHECK-NEXT:    lsr x8, x8, #4
-; CHECK-NEXT:    add x11, x0, x1
-; CHECK-NEXT:    mov w10, #33554432
 ; CHECK-NEXT:    punpklo p1.h, p0.b
-; CHECK-NEXT:    madd x8, x8, x9, x11
+; CHECK-NEXT:    mov w8, #33554432 // =0x2000000
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    index z1.d, #0, x8
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    add x8, x0, x1
+; CHECK-NEXT:    lsr x9, x9, #4
+; CHECK-NEXT:    mov w10, #67108864 // =0x4000000
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
-; CHECK-NEXT:    index z1.d, #0, x10
-; CHECK-NEXT:    st1b { z2.d }, p1, [x11, z1.d]
+; CHECK-NEXT:    st1b { z2.d }, p1, [x8, z1.d]
+; CHECK-NEXT:    madd x8, x9, x10, x8
 ; CHECK-NEXT:    st1b { z0.d }, p0, [x8, z1.d]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
@@ -131,19 +131,19 @@ define void @scatter_i8_index_offset_maximum_plus_one(ptr %base, i64 %offset, <v
 define void @scatter_i8_index_offset_minimum_minus_one(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
 ; CHECK-LABEL: scatter_i8_index_offset_minimum_minus_one:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    mov x9, #-2
-; CHECK-NEXT:    lsr x8, x8, #4
-; CHECK-NEXT:    movk x9, #64511, lsl #16
-; CHECK-NEXT:    add x11, x0, x1
-; CHECK-NEXT:    mov x10, #-33554433
-; CHECK-NEXT:    madd x8, x8, x9, x11
 ; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    mov x8, #-33554433 // =0xfffffffffdffffff
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    index z1.d, #0, x8
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    mov x10, #-2 // =0xfffffffffffffffe
+; CHECK-NEXT:    lsr x9, x9, #4
+; CHECK-NEXT:    add x8, x0, x1
+; CHECK-NEXT:    movk x10, #64511, lsl #16
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    index z1.d, #0, x10
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
-; CHECK-NEXT:    st1b { z2.d }, p1, [x11, z1.d]
+; CHECK-NEXT:    st1b { z2.d }, p1, [x8, z1.d]
+; CHECK-NEXT:    madd x8, x9, x10, x8
 ; CHECK-NEXT:    st1b { z0.d }, p0, [x8, z1.d]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
@@ -162,18 +162,18 @@ define void @scatter_i8_index_offset_minimum_minus_one(ptr %base, i64 %offset, <
 define void @scatter_i8_index_stride_too_big(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
 ; CHECK-LABEL: scatter_i8_index_stride_too_big:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    mov x9, #-9223372036854775808
-; CHECK-NEXT:    lsr x8, x8, #4
-; CHECK-NEXT:    add x11, x0, x1
-; CHECK-NEXT:    mov x10, #4611686018427387904
 ; CHECK-NEXT:    punpklo p1.h, p0.b
-; CHECK-NEXT:    madd x8, x8, x9, x11
+; CHECK-NEXT:    mov x8, #4611686018427387904 // =0x4000000000000000
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
+; CHECK-NEXT:    index z1.d, #0, x8
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    add x8, x0, x1
+; CHECK-NEXT:    lsr x9, x9, #4
+; CHECK-NEXT:    mov x10, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    uunpkhi z0.d, z0.s
-; CHECK-NEXT:    index z1.d, #0, x10
-; CHECK-NEXT:    st1b { z2.d }, p1, [x11, z1.d]
+; CHECK-NEXT:    st1b { z2.d }, p1, [x8, z1.d]
+; CHECK-NEXT:    madd x8, x9, x10, x8
 ; CHECK-NEXT:    st1b { z0.d }, p0, [x8, z1.d]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
@@ -194,8 +194,8 @@ define void @scatter_i8_index_stride_too_big(ptr %base, i64 %offset, <vscale x 4
 define <vscale x 4 x i8> @gather_8i8_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg) #0 {
 ; CHECK-LABEL: gather_8i8_index_offset_8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x1, lsl #3
 ; CHECK-NEXT:    index z0.s, #0, #8
+; CHECK-NEXT:    add x8, x0, x1, lsl #3
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x8, z0.s, sxtw]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
@@ -214,10 +214,10 @@ define <vscale x 4 x i8> @gather_8i8_index_offset_8(ptr %base, i64 %offset, <vsc
 define <vscale x 4 x float> @gather_f32_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg) #0 {
 ; CHECK-LABEL: gather_f32_index_offset_8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32
-; CHECK-NEXT:    add x9, x0, x1, lsl #5
+; CHECK-NEXT:    mov w8, #32 // =0x20
 ; CHECK-NEXT:    index z0.s, #0, w8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, z0.s, sxtw]
+; CHECK-NEXT:    add x8, x0, x1, lsl #5
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, z0.s, sxtw]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -235,8 +235,8 @@ define <vscale x 4 x float> @gather_f32_index_offset_8(ptr %base, i64 %offset, <
 define void @scatter_i8_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
 ; CHECK-LABEL: scatter_i8_index_offset_8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x1, lsl #3
 ; CHECK-NEXT:    index z1.s, #0, #8
+; CHECK-NEXT:    add x8, x0, x1, lsl #3
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x8, z1.s, sxtw]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
@@ -255,10 +255,10 @@ define void @scatter_i8_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1>
 define void @scatter_f16_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
 ; CHECK-LABEL: scatter_f16_index_offset_8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #16
-; CHECK-NEXT:    add x9, x0, x1, lsl #4
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    index z1.s, #0, w8
-; CHECK-NEXT:    st1h { z0.s }, p0, [x9, z1.s, sxtw]
+; CHECK-NEXT:    add x8, x0, x1, lsl #4
+; CHECK-NEXT:    st1h { z0.s }, p0, [x8, z1.s, sxtw]
 ; CHECK-NEXT:    ret
   %t0 = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
   %t1 = shufflevector <vscale x 4 x i64> %t0, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -274,11 +274,11 @@ define void @scatter_f16_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1
 define void @scatter_f16_index_add_add(ptr %base, i64 %offset, i64 %offset2, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
 ; CHECK-LABEL: scatter_f16_index_add_add:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    add x9, x0, x2, lsl #4
-; CHECK-NEXT:    add x9, x9, x1, lsl #4
 ; CHECK-NEXT:    index z1.s, #0, w8
-; CHECK-NEXT:    st1h { z0.s }, p0, [x9, z1.s, sxtw]
+; CHECK-NEXT:    add x8, x9, x1, lsl #4
+; CHECK-NEXT:    st1h { z0.s }, p0, [x8, z1.s, sxtw]
 ; CHECK-NEXT:    ret
   %splat.offset.ins = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
   %splat.offset = shufflevector <vscale x 4 x i64> %splat.offset.ins, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -297,11 +297,11 @@ define void @scatter_f16_index_add_add(ptr %base, i64 %offset, i64 %offset2, <vs
 define void @scatter_f16_index_add_add_mul(ptr %base, i64 %offset, i64 %offset2, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
 ; CHECK-LABEL: scatter_f16_index_add_add_mul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #128
+; CHECK-NEXT:    mov w8, #128 // =0x80
 ; CHECK-NEXT:    add x9, x0, x2, lsl #7
-; CHECK-NEXT:    add x9, x9, x1, lsl #7
 ; CHECK-NEXT:    index z1.s, #0, w8
-; CHECK-NEXT:    st1h { z0.s }, p0, [x9, z1.s, sxtw]
+; CHECK-NEXT:    add x8, x9, x1, lsl #7
+; CHECK-NEXT:    st1h { z0.s }, p0, [x8, z1.s, sxtw]
 ; CHECK-NEXT:    ret
   %splat.offset.ins = insertelement <vscale x 4 x i64> undef, i64 %offset, i32 0
   %splat.offset = shufflevector <vscale x 4 x i64> %splat.offset.ins, <vscale x 4 x i64> undef, <vscale x 4 x i32> zeroinitializer
@@ -322,7 +322,7 @@ define void @scatter_f16_index_add_add_mul(ptr %base, i64 %offset, i64 %offset2,
 define <vscale x 2 x i64> @masked_gather_nxv2i64_const_with_vec_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg) #0 {
 ; CHECK-LABEL: masked_gather_nxv2i64_const_with_vec_offsets:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 // =0x8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i64, ptr inttoptr (i64 8 to ptr), <vscale x 2 x i64> %vector_offsets
@@ -347,7 +347,7 @@ define <vscale x 2 x i64> @masked_gather_nxv2i64_null_with_vec_plus_scalar_offse
 define <vscale x 2 x i64> @masked_gather_nxv2i64_null_with__vec_plus_imm_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg) #0 {
 ; CHECK-LABEL: masked_gather_nxv2i64_null_with__vec_plus_imm_offsets:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 // =0x8
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
 ; CHECK-NEXT:    ret
   %scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 1, i64 0
@@ -400,7 +400,7 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32_u32s8_offsets(ptr %base, <vscal
 define void @masked_scatter_nxv2i64_const_with_vec_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %data) #0 {
 ; CHECK-LABEL: masked_scatter_nxv2i64_const_with_vec_offsets:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 // =0x8
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, z0.d, lsl #3]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i64, ptr inttoptr (i64 8 to ptr), <vscale x 2 x i64> %vector_offsets
@@ -425,7 +425,7 @@ define void @masked_scatter_nxv2i64_null_with_vec_plus_scalar_offsets(<vscale x
 define void @masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %data) #0 {
 ; CHECK-LABEL: masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 // =0x8
 ; CHECK-NEXT:    st1d { z1.d }, p0, [x8, z0.d, lsl #3]
 ; CHECK-NEXT:    ret
   %scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 1, i64 0

diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
index e8f97309149c73..9028d36528f262 100644
--- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
@@ -31,9 +31,9 @@ define <vscale x 2 x i64> @no_dag_combine_sext(<vscale x 2 x i1> %pg,
 ; CHECK-LABEL: no_dag_combine_sext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z1.d }, p0/z, [z0.d, #16]
-; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ptrue p2.d
 ; CHECK-NEXT:    movprfx z0, z1
-; CHECK-NEXT:    sxtb z0.d, p0/m, z1.d
+; CHECK-NEXT:    sxtb z0.d, p2/m, z1.d
 ; CHECK-NEXT:    st1b { z1.d }, p1, [x0]
 ; CHECK-NEXT:    ret
                                                <vscale x 2 x i64> %base,
@@ -76,8 +76,8 @@ define <vscale x 2 x i64> @no_dag_combine_zext(<vscale x 2 x i1> %pg,
 define <vscale x 16 x i8> @narrow_i64_gather_index_i8_zext(i8* %out, i8* %in, <vscale x 16 x i8> %d, i64 %ptr){
 ; CHECK-LABEL: narrow_i64_gather_index_i8_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x1, x2
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    add x8, x1, x2
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1, x2]
 ; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x8, #2, mul vl]
@@ -102,8 +102,8 @@ define <vscale x 16 x i8> @narrow_i64_gather_index_i8_zext(i8* %out, i8* %in, <v
 define <vscale x 16 x i8> @narrow_i64_gather_index_i8_sext(i8* %out, i8* %in, <vscale x 16 x i8> %d, i64 %ptr){
 ; CHECK-LABEL: narrow_i64_gather_index_i8_sext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x1, x2
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    add x8, x1, x2
 ; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x1, x2]
 ; CHECK-NEXT:    ld1sb { z1.s }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    ld1sb { z2.s }, p0/z, [x8, #2, mul vl]
@@ -128,8 +128,8 @@ define <vscale x 16 x i8> @narrow_i64_gather_index_i8_sext(i8* %out, i8* %in, <v
 define <vscale x 8 x i16> @narrow_i64_gather_index_i16_zext(i16* %out, i16* %in, <vscale x 8 x i16> %d, i64 %ptr){
 ; CHECK-LABEL: narrow_i64_gather_index_i16_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x1, x2, lsl #1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    add x8, x1, x2, lsl #1
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1, x2, lsl #1]
 ; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1, z0.s, uxtw #1]
@@ -148,8 +148,8 @@ define <vscale x 8 x i16> @narrow_i64_gather_index_i16_zext(i16* %out, i16* %in,
 define <vscale x 8 x i16> @narrow_i64_gather_index_i16_sext(i16* %out, i16* %in, <vscale x 8 x i16> %d, i64 %ptr){
 ; CHECK-LABEL: narrow_i64_gather_index_i16_sext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x1, x2, lsl #1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    add x8, x1, x2, lsl #1
 ; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x1, x2, lsl #1]
 ; CHECK-NEXT:    ld1sh { z1.s }, p0/z, [x8, #1, mul vl]
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1, z0.s, sxtw #1]

diff --git a/llvm/test/CodeGen/AArch64/sve-gep.ll b/llvm/test/CodeGen/AArch64/sve-gep.ll
index 209ebd33dd82bc..fd93e43613c52c 100644
--- a/llvm/test/CodeGen/AArch64/sve-gep.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gep.ll
@@ -225,8 +225,8 @@ define <vscale x 2 x <vscale x 2 x i64>*> @scalable_of_scalable_3(<vscale x 2 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    sxtw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    sxtw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    ret
   %d = getelementptr <vscale x 2 x i64>, <vscale x 2 x <vscale x 2 x i64>*> %base, <vscale x 2 x i32> %idx

diff --git a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll
index 478fdc74f11f05..e20399de70bf83 100644
--- a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll
+++ b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll
@@ -175,15 +175,15 @@ define <vscale x 2 x i64> @uminv_zero_fill(<vscale x 2 x i1> %pg, <vscale x 2 x
 define <vscale x 2 x i64> @zero_fill_non_zero_index(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: zero_fill_non_zero_index:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
-; CHECK-NEXT:    uminv d0, p0, z0.d
-; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z0.d, x8
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z0.d
+; CHECK-NEXT:    uminv d3, p0, z0.d
+; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    mov z0.d, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p0/m, x9
+; CHECK-NEXT:    fmov x8, d3
+; CHECK-NEXT:    cmpeq p0.d, p1/z, z1.d, z2.d
+; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    ret
   %t1 = call i64 @llvm.aarch64.sve.uminv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a)
   %t2 = insertelement <vscale x 2 x i64> zeroinitializer, i64 %t1, i64 1
@@ -195,8 +195,8 @@ define <vscale x 2 x i64> @zero_fill_non_zero_index(<vscale x 2 x i1> %pg, <vsca
 define <vscale x 4 x i64> @zero_fill_type_mismatch(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: zero_fill_type_mismatch:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    ret
   %t1 = call i64 @llvm.aarch64.sve.uminv.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a)
   %t2 = insertelement <vscale x 4 x i64> zeroinitializer, i64 %t1, i64 0
@@ -210,11 +210,12 @@ define <vscale x 4 x i64> @zero_fill_type_mismatch(<vscale x 2 x i1> %pg, <vscal
 define <vscale x 2 x i64> @zero_fill_no_zero_upper_lanes(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a) #0 {
 ; CHECK-LABEL: zero_fill_no_zero_upper_lanes:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d, vl1
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z0.d
-; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    mov z0.d, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p0/m, x8
+; CHECK-NEXT:    mov z1.d, p1/m, x8
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %t1 = call <vscale x 2 x i64> @llvm.aarch64.sve.umin.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> %a)
   %t2 = extractelement <vscale x 2 x i64> %t1, i64 0

diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
index 480c79f0ac691a..f327e32c92e006 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll
@@ -4,8 +4,8 @@
 define <vscale x 16 x i8> @test_lane0_16xi8(<vscale x 16 x i8> %a) {
 ; CHECK-LABEL: test_lane0_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #30
 ; CHECK-NEXT:    ptrue p0.b, vl1
+; CHECK-NEXT:    mov w8, #30 // =0x1e
 ; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 0
@@ -15,8 +15,8 @@ define <vscale x 16 x i8> @test_lane0_16xi8(<vscale x 16 x i8> %a) {
 define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: test_lane0_8xi16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #30
 ; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    mov w8, #30 // =0x1e
 ; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x i16> %a, i16 30, i32 0
@@ -26,8 +26,8 @@ define <vscale x 8 x i16> @test_lane0_8xi16(<vscale x 8 x i16> %a) {
 define <vscale x 4 x i32> @test_lane0_4xi32(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: test_lane0_4xi32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #30
 ; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    mov w8, #30 // =0x1e
 ; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 4 x i32> %a, i32 30, i32 0
@@ -37,8 +37,8 @@ define <vscale x 4 x i32> @test_lane0_4xi32(<vscale x 4 x i32> %a) {
 define <vscale x 2 x i64> @test_lane0_2xi64(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: test_lane0_2xi64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #30
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    mov w8, #30 // =0x1e
 ; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 0
@@ -48,8 +48,8 @@ define <vscale x 2 x i64> @test_lane0_2xi64(<vscale x 2 x i64> %a) {
 define <vscale x 2 x double> @test_lane0_2xf64(<vscale x 2 x double> %a) {
 ; CHECK-LABEL: test_lane0_2xf64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d1, #1.00000000
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    fmov d1, #1.00000000
 ; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 2 x double> %a, double 1.0, i32 0
@@ -59,8 +59,8 @@ define <vscale x 2 x double> @test_lane0_2xf64(<vscale x 2 x double> %a) {
 define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: test_lane0_4xf32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s1, #1.00000000
 ; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    fmov s1, #1.00000000
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 4 x float> %a, float 1.0, i32 0
@@ -70,8 +70,8 @@ define <vscale x 4 x float> @test_lane0_4xf32(<vscale x 4 x float> %a) {
 define <vscale x 8 x half> @test_lane0_8xf16(<vscale x 8 x half> %a) {
 ; CHECK-LABEL: test_lane0_8xf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov h1, #1.00000000
 ; CHECK-NEXT:    ptrue p0.h, vl1
+; CHECK-NEXT:    fmov h1, #1.00000000
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 0
@@ -93,13 +93,13 @@ define <vscale x 8 x bfloat> @test_lane0_8xbf16(<vscale x 8 x bfloat> %a, bfloat
 define <vscale x 2 x i64> @test_lane4_2xi64(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: test_lane4_2xi64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #4
-; CHECK-NEXT:    mov w9, #30
-; CHECK-NEXT:    index z2.d, #0, #1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    mov z0.d, p0/m, x9
+; CHECK-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    mov w8, #30 // =0x1e
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 2 x i64> %a, i64 30, i32 4
   ret <vscale x 2 x i64> %b
@@ -109,12 +109,12 @@ define <vscale x 2 x i64> @test_lane4_2xi64(<vscale x 2 x i64> %a) {
 define <vscale x 8 x half> @test_lane9_8xf16(<vscale x 8 x half> %a) {
 ; CHECK-LABEL: test_lane9_8xf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #9
-; CHECK-NEXT:    fmov h1, #1.00000000
-; CHECK-NEXT:    index z3.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #9 // =0x9
+; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    fmov h1, #1.00000000
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x half> %a, half 1.0, i32 9
@@ -124,11 +124,11 @@ define <vscale x 8 x half> @test_lane9_8xf16(<vscale x 8 x half> %a) {
 define <vscale x 8 x bfloat> @test_lane9_8xbf16(<vscale x 8 x bfloat> %a, bfloat %x) {
 ; CHECK-LABEL: test_lane9_8xbf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #9
-; CHECK-NEXT:    index z3.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    mov w8, #9 // =0x9
+; CHECK-NEXT:    index z2.h, #0, #1
+; CHECK-NEXT:    mov z3.h, w8
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z3.h
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x bfloat> %a, bfloat %x, i32 9
@@ -138,13 +138,13 @@ define <vscale x 8 x bfloat> @test_lane9_8xbf16(<vscale x 8 x bfloat> %a, bfloat
 define <vscale x 16 x i8> @test_lane1_16xi8(<vscale x 16 x i8> %a) {
 ; CHECK-LABEL: test_lane1_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
-; CHECK-NEXT:    mov w9, #30
-; CHECK-NEXT:    index z2.b, #0, #1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z1.b, w8
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z1.b
-; CHECK-NEXT:    mov z0.b, p0/m, w9
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    mov z2.b, w8
+; CHECK-NEXT:    mov w8, #30 // =0x1e
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 1
   ret <vscale x 16 x i8> %b
@@ -153,13 +153,13 @@ define <vscale x 16 x i8> @test_lane1_16xi8(<vscale x 16 x i8> %a) {
 define <vscale x 16 x i8> @test_lanex_16xi8(<vscale x 16 x i8> %a, i32 %x) {
 ; CHECK-LABEL: test_lanex_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    mov w9, #30
-; CHECK-NEXT:    index z2.b, #0, #1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z1.b, w8
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z1.b
-; CHECK-NEXT:    mov z0.b, p0/m, w9
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    mov z2.b, w8
+; CHECK-NEXT:    mov w8, #30 // =0x1e
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 16 x i8> %a, i8 30, i32 %x
   ret <vscale x 16 x i8> %b
@@ -179,11 +179,11 @@ define <vscale x 4 x i32> @extract_insert_4xi32(<vscale x 4 x i32> %a) {
 define <vscale x 8 x i16> @test_lane6_undef_8xi16(i16 %a) {
 ; CHECK-LABEL: test_lane6_undef_8xi16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #6
-; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z0.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z0.h
+; CHECK-NEXT:    mov w8, #6 // =0x6
+; CHECK-NEXT:    index z0.h, #0, #1
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.h, p0/m, w0
 ; CHECK-NEXT:    ret
   %b = insertelement <vscale x 8 x i16> undef, i16 %a, i32 6
@@ -202,8 +202,8 @@ define <vscale x 16 x i8> @test_lane0_undef_16xi8(i8 %a) {
 define <vscale x 16 x i8> @test_insert0_of_extract0_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: test_insert0_of_extract0_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    ptrue p0.b, vl1
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    ret
   %c = extractelement <vscale x 16 x i8> %b, i32 0
@@ -214,14 +214,14 @@ define <vscale x 16 x i8> @test_insert0_of_extract0_16xi8(<vscale x 16 x i8> %a,
 define <vscale x 16 x i8> @test_insert64_of_extract64_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: test_insert64_of_extract64_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64
-; CHECK-NEXT:    index z3.b, #0, #1
+; CHECK-NEXT:    mov w8, #64 // =0x40
 ; CHECK-NEXT:    ptrue p1.b
 ; CHECK-NEXT:    whilels p0.b, xzr, x8
 ; CHECK-NEXT:    mov z2.b, w8
-; CHECK-NEXT:    lastb w8, p0, z1.b
-; CHECK-NEXT:    cmpeq p0.b, p1/z, z3.b, z2.b
-; CHECK-NEXT:    mov z0.b, p0/m, w8
+; CHECK-NEXT:    lastb w9, p0, z1.b
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    cmpeq p0.b, p1/z, z1.b, z2.b
+; CHECK-NEXT:    mov z0.b, p0/m, w9
 ; CHECK-NEXT:    ret
   %c = extractelement <vscale x 16 x i8> %b, i32 64
   %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 64
@@ -231,13 +231,13 @@ define <vscale x 16 x i8> @test_insert64_of_extract64_16xi8(<vscale x 16 x i8> %
 define <vscale x 16 x i8> @test_insert3_of_extract1_16xi8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: test_insert3_of_extract1_16xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3
-; CHECK-NEXT:    umov w9, v1.b[1]
-; CHECK-NEXT:    index z2.b, #0, #1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z1.b, w8
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z1.b
-; CHECK-NEXT:    mov z0.b, p0/m, w9
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    mov z3.b, w8
+; CHECK-NEXT:    umov w8, v1.b[1]
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    ret
   %c = extractelement <vscale x 16 x i8> %b, i32 1
   %d = insertelement <vscale x 16 x i8> %a, i8 %c, i32 3
@@ -329,9 +329,9 @@ define <vscale x 2 x double> @test_insert_into_undef_nxv2f64(double %a) {
 define <vscale x 2 x half> @test_insert_with_index_nxv2f16(half %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv2f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    index z1.d, #0, #1
 ; CHECK-NEXT:    mov z2.d, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.h, p0/m, h0
 ; CHECK-NEXT:    ret
@@ -342,9 +342,9 @@ define <vscale x 2 x half> @test_insert_with_index_nxv2f16(half %h, i64 %idx) {
 define <vscale x 4 x half> @test_insert_with_index_nxv4f16(half %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv4f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    index z1.s, #0, #1
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    mov z0.h, p0/m, h0
 ; CHECK-NEXT:    ret
@@ -355,9 +355,9 @@ define <vscale x 4 x half> @test_insert_with_index_nxv4f16(half %h, i64 %idx) {
 define <vscale x 8 x half> @test_insert_with_index_nxv8f16(half %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv8f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    mov z2.h, w0
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT:    mov z0.h, p0/m, h0
 ; CHECK-NEXT:    ret
@@ -368,9 +368,9 @@ define <vscale x 8 x half> @test_insert_with_index_nxv8f16(half %h, i64 %idx) {
 define <vscale x 2 x bfloat> @test_insert_with_index_nxv2bf16(bfloat %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv2bf16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    index z1.d, #0, #1
 ; CHECK-NEXT:    mov z2.d, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.h, p0/m, h0
 ; CHECK-NEXT:    ret
@@ -381,9 +381,9 @@ define <vscale x 2 x bfloat> @test_insert_with_index_nxv2bf16(bfloat %h, i64 %id
 define <vscale x 4 x bfloat> @test_insert_with_index_nxv4bf16(bfloat %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv4bf16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    index z1.s, #0, #1
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    mov z0.h, p0/m, h0
 ; CHECK-NEXT:    ret
@@ -394,9 +394,9 @@ define <vscale x 4 x bfloat> @test_insert_with_index_nxv4bf16(bfloat %h, i64 %id
 define <vscale x 8 x bfloat> @test_insert_with_index_nxv8bf16(bfloat %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv8bf16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    mov z2.h, w0
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT:    mov z0.h, p0/m, h0
 ; CHECK-NEXT:    ret
@@ -407,9 +407,9 @@ define <vscale x 8 x bfloat> @test_insert_with_index_nxv8bf16(bfloat %h, i64 %id
 define <vscale x 2 x float> @test_insert_with_index_nxv2f32(float %f, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv2f32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    index z1.d, #0, #1
 ; CHECK-NEXT:    mov z2.d, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.s, p0/m, s0
 ; CHECK-NEXT:    ret
@@ -420,9 +420,9 @@ define <vscale x 2 x float> @test_insert_with_index_nxv2f32(float %f, i64 %idx)
 define <vscale x 4 x float> @test_insert_with_index_nxv4f32(float %f, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv4f32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    index z1.s, #0, #1
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    mov z0.s, p0/m, s0
 ; CHECK-NEXT:    ret
@@ -433,9 +433,9 @@ define <vscale x 4 x float> @test_insert_with_index_nxv4f32(float %f, i64 %idx)
 define <vscale x 2 x double> @test_insert_with_index_nxv2f64(double %d, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv2f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    index z1.d, #0, #1
 ; CHECK-NEXT:    mov z2.d, x0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT:    mov z0.d, p0/m, d0
 ; CHECK-NEXT:    ret
@@ -450,8 +450,8 @@ define <vscale x 2 x i1> @test_predicate_insert_2xi1_immediate (<vscale x 2 x i1
 ; CHECK-NEXT:    ptrue p1.d, vl1
 ; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    mov z0.d, p1/m, x0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z0.d, p1/m, x0
 ; CHECK-NEXT:    and z0.d, z0.d, #0x1
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    ret
@@ -462,11 +462,11 @@ define <vscale x 2 x i1> @test_predicate_insert_2xi1_immediate (<vscale x 2 x i1
 define <vscale x 4 x i1> @test_predicate_insert_4xi1_immediate (<vscale x 4 x i1> %val, i1 %elt) {
 ; CHECK-LABEL: test_predicate_insert_4xi1_immediate:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #2
-; CHECK-NEXT:    index z1.s, #0, #1
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z0.s, w8
-; CHECK-NEXT:    cmpeq p2.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    mov w8, #2 // =0x2
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cmpeq p2.s, p1/z, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.s, p2/m, w0
 ; CHECK-NEXT:    and z0.s, z0.s, #0x1
@@ -479,14 +479,14 @@ define <vscale x 4 x i1> @test_predicate_insert_4xi1_immediate (<vscale x 4 x i1
 define <vscale x 8 x i1> @test_predicate_insert_8xi1_immediate (<vscale x 8 x i1> %val, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_8xi1_immediate:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    mov w9, #1
-; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z0.h, w8
-; CHECK-NEXT:    cmpeq p2.h, p1/z, z1.h, z0.h
+; CHECK-NEXT:    mov w8, w0
+; CHECK-NEXT:    index z0.h, #0, #1
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    cmpeq p2.h, p1/z, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
-; CHECK-NEXT:    mov z0.h, p2/m, w9
+; CHECK-NEXT:    mov z0.h, p2/m, w8
 ; CHECK-NEXT:    and z0.h, z0.h, #0x1
 ; CHECK-NEXT:    cmpne p0.h, p1/z, z0.h, #0
 ; CHECK-NEXT:    ret
@@ -497,12 +497,12 @@ define <vscale x 8 x i1> @test_predicate_insert_8xi1_immediate (<vscale x 8 x i1
 define <vscale x 16 x i1> @test_predicate_insert_16xi1_immediate (<vscale x 16 x i1> %val) {
 ; CHECK-LABEL: test_predicate_insert_16xi1_immediate:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #4
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    index z1.b, #0, #1
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    mov z0.b, w9
-; CHECK-NEXT:    cmpeq p2.b, p1/z, z1.b, z0.b
+; CHECK-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEXT:    index z0.b, #0, #1
+; CHECK-NEXT:    mov z1.b, w8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    cmpeq p2.b, p1/z, z0.b, z1.b
 ; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.b, p2/m, w8
 ; CHECK-NEXT:    and z0.b, z0.b, #0x1
@@ -516,12 +516,12 @@ define <vscale x 16 x i1> @test_predicate_insert_16xi1_immediate (<vscale x 16 x
 define <vscale x 2 x i1> @test_predicate_insert_2xi1(<vscale x 2 x i1> %val, i1 %elt, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_2xi1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
-; CHECK-NEXT:    index z1.d, #0, #1
 ; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT:    mov z0.d, x8
-; CHECK-NEXT:    cmpeq p2.d, p1/z, z1.d, z0.d
+; CHECK-NEXT:    cmpeq p2.d, p1/z, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.d, p2/m, x0
 ; CHECK-NEXT:    and z0.d, z0.d, #0x1
@@ -534,11 +534,11 @@ define <vscale x 2 x i1> @test_predicate_insert_2xi1(<vscale x 2 x i1> %val, i1
 define <vscale x 4 x i1> @test_predicate_insert_4xi1(<vscale x 4 x i1> %val, i1 %elt, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_4xi1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
-; CHECK-NEXT:    index z1.s, #0, #1
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z0.s, w8
-; CHECK-NEXT:    cmpeq p2.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cmpeq p2.s, p1/z, z0.s, z1.s
 ; CHECK-NEXT:    mov z0.s, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.s, p2/m, w0
 ; CHECK-NEXT:    and z0.s, z0.s, #0x1
@@ -550,11 +550,11 @@ define <vscale x 4 x i1> @test_predicate_insert_4xi1(<vscale x 4 x i1> %val, i1
 define <vscale x 8 x i1> @test_predicate_insert_8xi1(<vscale x 8 x i1> %val, i1 %elt, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_8xi1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
-; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z0.h, w8
-; CHECK-NEXT:    cmpeq p2.h, p1/z, z1.h, z0.h
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    index z0.h, #0, #1
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    cmpeq p2.h, p1/z, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.h, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.h, p2/m, w0
 ; CHECK-NEXT:    and z0.h, z0.h, #0x1
@@ -567,11 +567,11 @@ define <vscale x 8 x i1> @test_predicate_insert_8xi1(<vscale x 8 x i1> %val, i1
 define <vscale x 16 x i1> @test_predicate_insert_16xi1(<vscale x 16 x i1> %val, i1 %elt, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_16xi1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, w1
-; CHECK-NEXT:    index z1.b, #0, #1
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    mov z0.b, w8
-; CHECK-NEXT:    cmpeq p2.b, p1/z, z1.b, z0.b
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    index z0.b, #0, #1
+; CHECK-NEXT:    mov z1.b, w8
+; CHECK-NEXT:    cmpeq p2.b, p1/z, z0.b, z1.b
 ; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
 ; CHECK-NEXT:    mov z0.b, p2/m, w0
 ; CHECK-NEXT:    and z0.b, z0.b, #0x1
@@ -589,24 +589,24 @@ define <vscale x 32 x i1> @test_predicate_insert_32xi1(<vscale x 32 x i1> %val,
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w9, w1
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    st1b { z0.b }, p1, [sp, #1, mul vl]
-; CHECK-NEXT:    mov z0.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
 ; CHECK-NEXT:    addvl x8, x8, #2
-; CHECK-NEXT:    st1b { z0.b }, p1, [sp]
+; CHECK-NEXT:    mov w9, w1
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    csel x8, x9, x8, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1b { z0.b }, p2, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z1.b }, p2, [sp]
 ; CHECK-NEXT:    strb w0, [x9, x8]
-; CHECK-NEXT:    ld1b { z0.b }, p1/z, [sp]
-; CHECK-NEXT:    ld1b { z1.b }, p1/z, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1b { z0.b }, p2/z, [sp]
+; CHECK-NEXT:    ld1b { z1.b }, p2/z, [sp, #1, mul vl]
 ; CHECK-NEXT:    and z0.b, z0.b, #0x1
 ; CHECK-NEXT:    and z1.b, z1.b, #0x1
-; CHECK-NEXT:    cmpne p0.b, p1/z, z0.b, #0
-; CHECK-NEXT:    cmpne p1.b, p1/z, z1.b, #0
+; CHECK-NEXT:    cmpne p0.b, p2/z, z0.b, #0
+; CHECK-NEXT:    cmpne p1.b, p2/z, z1.b, #0
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index 37ea51ba787193..9ca928c00299fe 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -17,15 +17,15 @@ define <vscale x 2 x i64> @insert_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec, <2
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cntd x8
-; CHECK-NEXT:    mov w9, #2
+; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    sub x8, x8, #2
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmp x8, #2
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    str q1, [x9, x8]
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -51,15 +51,15 @@ define <vscale x 4 x i32> @insert_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec, <4
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    mov w9, #4
+; CHECK-NEXT:    mov w9, #4 // =0x4
 ; CHECK-NEXT:    sub x8, x8, #4
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmp x8, #4
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    str q1, [x9, x8]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -85,15 +85,15 @@ define <vscale x 8 x i16> @insert_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec, <8
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cnth x8
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    sub x8, x8, #8
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cmp x8, #8
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    lsl x8, x8, #1
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    str q1, [x9, x8]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -119,15 +119,15 @@ define <vscale x 16 x i8> @insert_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov x8, #-16
-; CHECK-NEXT:    mov w9, #16
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    mov x8, #-16 // =0xfffffffffffffff0
+; CHECK-NEXT:    mov w9, #16 // =0x10
 ; CHECK-NEXT:    addvl x8, x8, #1
+; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    cmp x8, #16
 ; CHECK-NEXT:    csel x8, x8, x9, lo
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    str q1, [x9, x8]
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    str q1, [x10, x8]
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -221,8 +221,8 @@ define void @insert_v2i64_nxv16i64(<2 x i64> %sv0, <2 x i64> %sv1, <vscale x 16
 define void @insert_v2i64_nxv16i64_lo0(<2 x i64>* %psv, <vscale x 16 x i64>* %out) {
 ; CHECK-LABEL: insert_v2i64_nxv16i64_lo0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %sv = load <2 x i64>, <2 x i64>* %psv
@@ -239,8 +239,8 @@ define void @insert_v2i64_nxv16i64_lo2(<2 x i64>* %psv, <vscale x 16 x i64>* %ou
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    str q0, [sp, #16]
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp, #1, mul vl]

diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
index 8a9ef0f875789f..3ccbd58847401a 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
@@ -55,8 +55,8 @@ define <vscale x 8 x i16> @smax_i16_neg(<vscale x 8 x i16> %a) {
 define <vscale x 8 x i16> @smax_i16_out_of_range(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: smax_i16_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dupm z1.b, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    dupm z1.b, #0x1
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 8 x i16> undef, i16 257, i32 0
@@ -93,8 +93,8 @@ define <vscale x 4 x i32> @smax_i32_neg(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @smax_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: smax_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, #-129 // =0xffffffffffffff7f
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z1.s, #-129 // =0xffffffffffffff7f
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 4 x i32> undef, i32 -129, i32 0
@@ -131,8 +131,8 @@ define <vscale x 2 x i64> @smax_i64_neg(<vscale x 2 x i64> %a) {
 define <vscale x 2 x i64> @smax_i64_out_of_range(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: smax_i64_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 2 x i64> undef, i64 65535, i32 0
@@ -196,8 +196,8 @@ define <vscale x 8 x i16> @smin_i16_neg(<vscale x 8 x i16> %a) {
 define <vscale x 8 x i16> @smin_i16_out_of_range(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: smin_i16_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dupm z1.b, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    dupm z1.b, #0x1
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 8 x i16> undef, i16 257, i32 0
@@ -234,8 +234,8 @@ define <vscale x 4 x i32> @smin_i32_neg(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @smin_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: smin_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, #-129 // =0xffffffffffffff7f
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z1.s, #-129 // =0xffffffffffffff7f
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 4 x i32> undef, i32 -129, i32 0
@@ -272,8 +272,8 @@ define <vscale x 2 x i64> @smin_i64_neg(<vscale x 2 x i64> %a) {
 define <vscale x 2 x i64> @smin_i64_out_of_range(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: smin_i64_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 2 x i64> undef, i64 65535, i32 0
@@ -325,8 +325,8 @@ define <vscale x 8 x i16> @umax_i16_pos(<vscale x 8 x i16> %a) {
 define <vscale x 8 x i16> @umax_i16_out_of_range(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: umax_i16_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dupm z1.b, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    dupm z1.b, #0x1
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 8 x i16> undef, i16 257, i32 0
@@ -351,8 +351,8 @@ define <vscale x 4 x i32> @umax_i32_pos(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @umax_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: umax_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -378,8 +378,8 @@ define <vscale x 2 x i64> @umax_i64_pos(<vscale x 2 x i64> %a) {
 define <vscale x 2 x i64> @umax_i64_out_of_range(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: umax_i64_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 2 x i64> undef, i64 65535, i32 0
@@ -431,8 +431,8 @@ define <vscale x 8 x i16> @umin_i16_pos(<vscale x 8 x i16> %a) {
 define <vscale x 8 x i16> @umin_i16_out_of_range(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: umin_i16_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    dupm z1.b, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    dupm z1.b, #0x1
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 8 x i16> undef, i16 257, i32 0
@@ -457,8 +457,8 @@ define <vscale x 4 x i32> @umin_i32_pos(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @umin_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: umin_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -484,8 +484,8 @@ define <vscale x 2 x i64> @umin_i64_pos(<vscale x 2 x i64> %a) {
 define <vscale x 2 x i64> @umin_i64_out_of_range(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: umin_i64_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z1.d, #65535 // =0xffff
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 2 x i64> undef, i64 65535, i32 0
@@ -589,8 +589,8 @@ define <vscale x 2 x i64> @mul_i64_pos(<vscale x 2 x i64> %a) {
 define <vscale x 8 x i16> @mul_i16_range(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: mul_i16_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.h, #255 // =0xff
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z1.h, #255 // =0xff
 ; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 8 x i16> undef, i16 255, i32 0
@@ -602,8 +602,8 @@ define <vscale x 8 x i16> @mul_i16_range(<vscale x 8 x i16> %a) {
 define <vscale x 4 x i32> @mul_i32_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: mul_i32_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.s, #255 // =0xff
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z1.s, #255 // =0xff
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 4 x i32> undef, i32 255, i32 0
@@ -615,8 +615,8 @@ define <vscale x 4 x i32> @mul_i32_range(<vscale x 4 x i32> %a) {
 define <vscale x 2 x i64> @mul_i64_range(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: mul_i64_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, #255 // =0xff
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z1.d, #255 // =0xff
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %elt = insertelement <vscale x 2 x i64> undef, i64 255, i32 0
@@ -766,8 +766,8 @@ define <vscale x 2 x i64> @lsr_i64(<vscale x 2 x i64> %a){
 define <vscale x 4 x i32> @sdiv_const(<vscale x 4 x i32> %a) #0 {
 ; CHECK-LABEL: sdiv_const:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z1.s, #3 // =0x3
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z1.s, #3 // =0x3
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -778,8 +778,8 @@ entry:
 define <vscale x 4 x i32> @udiv_const(<vscale x 4 x i32> %a) #0 {
 ; CHECK-LABEL: udiv_const:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z1.s, #3 // =0x3
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z1.s, #3 // =0x3
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
index 6dcad12234f67b..1bace71db0c118 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -165,9 +165,9 @@ define <vscale x 8 x i64> @abs_nxv8i64(<vscale x 8 x i64> %a) {
 ; CHECK-LABEL: abs_nxv8i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    abs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d
+; CHECK-NEXT:    abs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    abs z3.d, p0/m, z3.d
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i64> @llvm.abs.nxv8i64(<vscale x 8 x i64> %a, i1 false)
@@ -748,8 +748,8 @@ define <vscale x 16 x i8> @mulsub_i8_negativeAddend(<vscale x 16 x i8> %a, <vsca
 define <vscale x 8 x i16> @multiple_fused_ops(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
 ; CHECK-LABEL: multiple_fused_ops:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #200 // =0xc8
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #200 // =0xc8
 ; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    mla z2.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
@@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
 ; CHECK-NEXT:    b.lt .LBB70_3
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    mov w9, w3
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z0.s, #1 // =0x1
+; CHECK-NEXT:    whilelo p0.s, xzr, x9
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    cntw x10
-; CHECK-NEXT:    mov z0.s, #1 // =0x1
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    whilelo p1.s, xzr, x9
 ; CHECK-NEXT:  .LBB70_2: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
-; CHECK-NEXT:    mad z1.s, p0/m, z2.s, z0.s
-; CHECK-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
+; CHECK-NEXT:    mad z1.s, p1/m, z2.s, z0.s
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    add x8, x8, x10
-; CHECK-NEXT:    whilelo p1.s, x8, x9
+; CHECK-NEXT:    whilelo p0.s, x8, x9
 ; CHECK-NEXT:    b.mi .LBB70_2
 ; CHECK-NEXT:  .LBB70_3: // %for.cond.cleanup
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index fea3c4aa455be2..d04da62451778a 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -380,12 +380,12 @@ define i8 @smin_nxv10i8(<vscale x 10 x i8> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpkhi z2.h, z0.b
 ; CHECK-NEXT:    mov z1.d, #127 // =0x7f
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    uunpklo z3.s, z2.h
 ; CHECK-NEXT:    uunpkhi z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
-; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z1.s
-; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
 ; CHECK-NEXT:    uzp1 z2.b, z0.b, z2.b
 ; CHECK-NEXT:    uunpkhi z2.h, z2.b
@@ -416,10 +416,10 @@ define i8 @uaddv_nxv12i8(<vscale x 12 x i8> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpkhi z2.h, z0.b
 ; CHECK-NEXT:    mov z1.s, #0 // =0x0
-; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z1.h
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z1.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
 ; CHECK-NEXT:    fmov x0, d0
@@ -436,12 +436,12 @@ define i8 @umax_nxv14i8(<vscale x 14 x i8> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpkhi z2.h, z0.b
 ; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    uunpkhi z3.s, z2.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
-; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uzp1 z1.s, z3.s, z1.s
-; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    uzp1 z1.h, z2.h, z1.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
 ; CHECK-NEXT:    umaxv b0, p0, z0.b

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems-i32.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems-i32.ll
index 721a35dcf0f17c..5062a43da931f8 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems-i32.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems-i32.ll
@@ -111,9 +111,9 @@ define i32 @inch(i32 %a) {
 define i32 @inch_mul(i32 %a) {
 ; NO_SCALAR_INC-LABEL: inch_mul:
 ; NO_SCALAR_INC:       // %bb.0:
-; NO_SCALAR_INC-NEXT:    mov w8, #5
-; NO_SCALAR_INC-NEXT:    cnth x9, vl8
-; NO_SCALAR_INC-NEXT:    madd w0, w9, w8, w0
+; NO_SCALAR_INC-NEXT:    cnth x8, vl8
+; NO_SCALAR_INC-NEXT:    mov w9, #5 // =0x5
+; NO_SCALAR_INC-NEXT:    madd w0, w8, w9, w0
 ; NO_SCALAR_INC-NEXT:    ret
 ;
 ; CHECK-LABEL: inch_mul:
@@ -155,9 +155,9 @@ define i32 @dech(i32 %a) {
 define i32 @dech_mul(i32 %a) {
 ; NO_SCALAR_INC-LABEL: dech_mul:
 ; NO_SCALAR_INC:       // %bb.0:
-; NO_SCALAR_INC-NEXT:    mov w8, #7
-; NO_SCALAR_INC-NEXT:    cnth x9, vl16
-; NO_SCALAR_INC-NEXT:    msub w0, w9, w8, w0
+; NO_SCALAR_INC-NEXT:    cnth x8, vl16
+; NO_SCALAR_INC-NEXT:    mov w9, #7 // =0x7
+; NO_SCALAR_INC-NEXT:    msub w0, w8, w9, w0
 ; NO_SCALAR_INC-NEXT:    ret
 ;
 ; CHECK-LABEL: dech_mul:
@@ -199,9 +199,9 @@ define i32 @incw(i32 %a) {
 define i32 @incw_mul(i32 %a) {
 ; NO_SCALAR_INC-LABEL: incw_mul:
 ; NO_SCALAR_INC:       // %bb.0:
-; NO_SCALAR_INC-NEXT:    mov w8, #12
-; NO_SCALAR_INC-NEXT:    cntw x9, vl32
-; NO_SCALAR_INC-NEXT:    madd w0, w9, w8, w0
+; NO_SCALAR_INC-NEXT:    cntw x8, vl32
+; NO_SCALAR_INC-NEXT:    mov w9, #12 // =0xc
+; NO_SCALAR_INC-NEXT:    madd w0, w8, w9, w0
 ; NO_SCALAR_INC-NEXT:    ret
 ;
 ; CHECK-LABEL: incw_mul:
@@ -284,9 +284,9 @@ define i32 @incd(i32 %base) {
 define i32 @incd_mul(i32 %base) {
 ; NO_SCALAR_INC-LABEL: incd_mul:
 ; NO_SCALAR_INC:       // %bb.0:
-; NO_SCALAR_INC-NEXT:    mov w8, #15
-; NO_SCALAR_INC-NEXT:    cntd x9, vl64
-; NO_SCALAR_INC-NEXT:    madd w0, w9, w8, w0
+; NO_SCALAR_INC-NEXT:    cntd x8, vl64
+; NO_SCALAR_INC-NEXT:    mov w9, #15 // =0xf
+; NO_SCALAR_INC-NEXT:    madd w0, w8, w9, w0
 ; NO_SCALAR_INC-NEXT:    ret
 ;
 ; CHECK-LABEL: incd_mul:
@@ -328,9 +328,9 @@ define i32 @decd(i32 %a) {
 define i32 @decd_mul(i32 %a) {
 ; NO_SCALAR_INC-LABEL: decd_mul:
 ; NO_SCALAR_INC:       // %bb.0:
-; NO_SCALAR_INC-NEXT:    mov w8, #9
-; NO_SCALAR_INC-NEXT:    cntd x9, vl2
-; NO_SCALAR_INC-NEXT:    msub w0, w9, w8, w0
+; NO_SCALAR_INC-NEXT:    cntd x8, vl2
+; NO_SCALAR_INC-NEXT:    mov w9, #9 // =0x9
+; NO_SCALAR_INC-NEXT:    msub w0, w8, w9, w0
 ; NO_SCALAR_INC-NEXT:    ret
 ;
 ; CHECK-LABEL: decd_mul:

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
index 7c46cf9c239ef8..2464eacd185dd4 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
@@ -44,8 +44,8 @@ define <vscale x 2 x i64> @index_ii_i64() {
 define <vscale x 2 x i64> @index_ii_range() {
 ; CHECK-LABEL: index_ii_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #16
-; CHECK-NEXT:    mov x9, #-17
+; CHECK-NEXT:    mov w8, #16 // =0x10
+; CHECK-NEXT:    mov x9, #-17 // =0xffffffffffffffef
 ; CHECK-NEXT:    index z0.d, x9, x8
 ; CHECK-NEXT:    ret
   %out = call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 -17, i64 16)
@@ -109,7 +109,7 @@ define <vscale x 2 x i64> @index_ir_i64(i64 %a) {
 define <vscale x 4 x i32> @index_ir_range(i32 %a) {
 ; CHECK-LABEL: index_ir_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-17
+; CHECK-NEXT:    mov w8, #-17 // =0xffffffef
 ; CHECK-NEXT:    index z0.s, w8, w0
 ; CHECK-NEXT:    ret
   %out = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 -17, i32 %a)
@@ -174,7 +174,7 @@ define <vscale x 2 x i64> @index_ri_i64(i64 %a) {
 define <vscale x 8 x i16> @index_ri_range(i16 %a) {
 ; CHECK-LABEL: index_ri_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    index z0.h, w0, w8
 ; CHECK-NEXT:    ret
   %out = call <vscale x 8 x i16> @llvm.aarch64.sve.index.nxv8i16(i16 %a, i16 16)
@@ -239,10 +239,10 @@ define <vscale x 4 x i32> @index_rr_i32_combine(i32 %a, i32 %b) {
 define <vscale x 4 x i32> @index_rr_i32_not_combine(i32 %a, i32 %b) {
 ; CHECK-LABEL: index_rr_i32_not_combine:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    index z0.s, #0, #1
 ; CHECK-NEXT:    mov z1.s, w0
 ; CHECK-NEXT:    mov z2.s, w1
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mla z1.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
index 59d52b00c00f55..c70006d988c19b 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
@@ -63,7 +63,7 @@ define <vscale x 4 x i32> @add_i32(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @add_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: add_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -93,7 +93,7 @@ define <vscale x 2 x i64> @add_i64(<vscale x 2 x i64> %a) {
 define <vscale x 2 x i64> @add_i64_out_of_range(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: add_i64_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -168,7 +168,7 @@ define <vscale x 4 x i32> @sub_i32(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @sub_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: sub_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    sub z0.s, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -198,7 +198,7 @@ define <vscale x 2 x i64> @sub_i64(<vscale x 2 x i64> %a) {
 define <vscale x 2 x i64> @sub_i64_out_of_range(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: sub_i64_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    sub z0.d, z0.d, z1.d
 ; CHECK-NEXT:    ret
@@ -323,7 +323,7 @@ define <vscale x 4 x i32> @subr_i32(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @subr_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: subr_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    sub z0.s, z1.s, z0.s
 ; CHECK-NEXT:    ret
@@ -353,7 +353,7 @@ define <vscale x 2 x i64> @subr_i64(<vscale x 2 x i64> %a) {
 define <vscale x 2 x i64> @subr_i64_out_of_range(<vscale x 2 x i64> %a) {
 ; CHECK-LABEL: subr_i64_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    sub z0.d, z1.d, z0.d
 ; CHECK-NEXT:    ret
@@ -449,8 +449,8 @@ define <vscale x 8 x i16> @smax_i16(<vscale x 8 x i16> %a) {
 define <vscale x 8 x i16> @smax_i16_out_of_range(<vscale x 8 x i16> %a) {
 ; CHECK-LABEL: smax_i16_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #129
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #129 // =0x81
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
@@ -636,8 +636,8 @@ define <vscale x 4 x i32> @smin_i32(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @smin_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: smin_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -794,8 +794,8 @@ define <vscale x 4 x i32> @umax_i32(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @umax_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: umax_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
@@ -951,8 +951,8 @@ define <vscale x 4 x i32> @umin_i32(<vscale x 4 x i32> %a) {
 define <vscale x 4 x i32> @umin_i32_out_of_range(<vscale x 4 x i32> %a) {
 ; CHECK-LABEL: umin_i32_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #257
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #257 // =0x101
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
index 42c65a22467788..ed820e0fc8a258 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll
@@ -415,9 +415,9 @@ define <vscale x 2 x i64> @bic_i64_zero_no_comm(<vscale x 2 x i1> %pg, <vscale x
 ; CHECK-LABEL: bic_i64_zero_no_comm:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov z2.d, #0 // =0x0
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
-; CHECK-NEXT:    bic z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, p0/m, z0.d
 ; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    bic z0.d, p0/m, z0.d, z2.d
 ; CHECK-NEXT:    ret
   %a_z = select <vscale x 2 x i1> %pg, <vscale x 2 x i64> %a, <vscale x 2 x i64> zeroinitializer
   %out = call <vscale x 2 x i64> @llvm.aarch64.sve.bic.nxv2i64(<vscale x 2 x i1> %pg,

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
index 411f92fbb15265..7d81ebaefddb85 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -452,10 +452,10 @@ define <vscale x 2 x double> @dupq_f64(<vscale x 2 x double> %a) {
 define <vscale x 16 x i8> @dupq_lane_i8(<vscale x 16 x i8> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    ret
@@ -467,10 +467,10 @@ define <vscale x 16 x i8> @dupq_lane_i8(<vscale x 16 x i8> %a, i64 %idx) {
 define <vscale x 8 x i16> @dupq_lane_i16(<vscale x 8 x i16> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    ret
@@ -482,10 +482,10 @@ define <vscale x 8 x i16> @dupq_lane_i16(<vscale x 8 x i16> %a, i64 %idx) {
 define <vscale x 4 x i32> @dupq_lane_i32(<vscale x 4 x i32> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    ret
@@ -497,10 +497,10 @@ define <vscale x 4 x i32> @dupq_lane_i32(<vscale x 4 x i32> %a, i64 %idx) {
 define <vscale x 2 x i64> @dupq_lane_i64(<vscale x 2 x i64> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    ret
@@ -512,10 +512,10 @@ define <vscale x 2 x i64> @dupq_lane_i64(<vscale x 2 x i64> %a, i64 %idx) {
 define <vscale x 8 x half> @dupq_lane_f16(<vscale x 8 x half> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    ret
@@ -527,10 +527,10 @@ define <vscale x 8 x half> @dupq_lane_f16(<vscale x 8 x half> %a, i64 %idx) {
 define <vscale x 8 x bfloat> @dupq_lane_bf16(<vscale x 8 x bfloat> %a, i64 %idx) #0 {
 ; CHECK-LABEL: dupq_lane_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    ret
@@ -542,10 +542,10 @@ define <vscale x 8 x bfloat> @dupq_lane_bf16(<vscale x 8 x bfloat> %a, i64 %idx)
 define <vscale x 4 x float> @dupq_lane_f32(<vscale x 4 x float> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    ret
@@ -557,10 +557,10 @@ define <vscale x 4 x float> @dupq_lane_f32(<vscale x 4 x float> %a, i64 %idx) {
 define <vscale x 2 x double> @dupq_lane_f64(<vscale x 2 x double> %a, i64 %idx) {
 ; CHECK-LABEL: dupq_lane_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    index z1.d, #0, #1
-; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    add x8, x0, x0
 ; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    tbl z0.d, { z0.d }, z1.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll
index 3adafc98ef4a72..cace1200c4c1f0 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-stN-reg-imm-addr-mode.ll
@@ -535,12 +535,12 @@ define void @st4b_i8_invalid_imm_out_of_lower_bound(<vscale x 16 x i8> %v0, <vsc
 ; CHECK-LABEL: st4b_i8_invalid_imm_out_of_lower_bound:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    mov x9, #-576
-; CHECK-NEXT:    lsr x8, x8, #4
+; CHECK-NEXT:    mov x9, #-576 // =0xfffffffffffffdc0
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    lsr x8, x8, #4
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT:    mul x8, x8, x9
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mul x8, x8, x9
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
 ; CHECK-NEXT:    st4b { z0.b - z3.b }, p0, [x0, x8]
 ; CHECK-NEXT:    ret
@@ -562,12 +562,12 @@ define void @st4b_i8_invalid_imm_out_of_upper_bound(<vscale x 16 x i8> %v0, <vsc
 ; CHECK-LABEL: st4b_i8_invalid_imm_out_of_upper_bound:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rdvl x8, #1
-; CHECK-NEXT:    mov w9, #512
-; CHECK-NEXT:    lsr x8, x8, #4
+; CHECK-NEXT:    mov w9, #512 // =0x200
 ; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    lsr x8, x8, #4
 ; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
-; CHECK-NEXT:    mul x8, x8, x9
 ; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; CHECK-NEXT:    mul x8, x8, x9
 ; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
 ; CHECK-NEXT:    st4b { z0.b - z3.b }, p0, [x0, x8]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
index ad1f4c9ca17c25..e742836d79fbe5 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
@@ -39,18 +39,18 @@ define <4 x i64> @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr)  #1 {
 ; CHECK-LABEL: test_post_ld1_int_fixed:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov w9, #2
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x2]
-; CHECK-NEXT:    ldr x10, [x0, x1, lsl #3]
-; CHECK-NEXT:    ldr x11, [x0]
-; CHECK-NEXT:    index z3.d, #0, #1
-; CHECK-NEXT:    mov z2.d, x9
+; CHECK-NEXT:    mov w9, #2 // =0x2
+; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    ptrue p1.d, vl1
-; CHECK-NEXT:    cmpeq p2.d, p0/z, z3.d, z2.d
-; CHECK-NEXT:    mov z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, p2/m, x10
-; CHECK-NEXT:    mov z1.d, p1/m, x11
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    mov z1.d, x9
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
+; CHECK-NEXT:    cmpeq p2.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ldr x9, [x0]
+; CHECK-NEXT:    ldr x10, [x0, x1, lsl #3]
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    mov z2.d, p2/m, x10
+; CHECK-NEXT:    mov z0.d, p1/m, x9
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT:    ret
   %A = load <4 x i64>, ptr %addr
@@ -67,17 +67,17 @@ define <4 x double> @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr)
 ; CHECK-LABEL: test_post_ld1_double_fixed:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov w9, #2
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x2]
-; CHECK-NEXT:    ldr d1, [x0, x1, lsl #3]
-; CHECK-NEXT:    ldr d2, [x0]
-; CHECK-NEXT:    index z4.d, #0, #1
-; CHECK-NEXT:    mov z3.d, x9
+; CHECK-NEXT:    mov w9, #2 // =0x2
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    mov z1.d, x9
 ; CHECK-NEXT:    ptrue p1.d, vl1
-; CHECK-NEXT:    cmpeq p2.d, p0/z, z4.d, z3.d
-; CHECK-NEXT:    sel z2.d, p1, z2.d, z0.d
-; CHECK-NEXT:    mov z0.d, p2/m, d1
-; CHECK-NEXT:    fadd z0.d, z2.d, z0.d
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
+; CHECK-NEXT:    cmpeq p2.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x0, x1, lsl #3]
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z2.d
+; CHECK-NEXT:    mov z2.d, p2/m, d1
+; CHECK-NEXT:    fadd z0.d, z0.d, z2.d
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT:    ret
   %A = load <4 x double>, ptr %addr

diff --git a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
index a37cea885c3b4a..0a071f826d926e 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll
@@ -39,8 +39,8 @@ define <vscale x 16 x i8> @ld1b_upper_bound(<vscale x 16 x i8>* %a) {
 define <vscale x 16 x i8> @ld1b_out_of_upper_bound(<vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: ld1b_out_of_upper_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rdvl x8, #8
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    rdvl x8, #8
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 8
@@ -51,8 +51,8 @@ define <vscale x 16 x i8> @ld1b_out_of_upper_bound(<vscale x 16 x i8>* %a) {
 define <vscale x 16 x i8> @ld1b_out_of_lower_bound(<vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: ld1b_out_of_lower_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rdvl x8, #-9
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    rdvl x8, #-9
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 -9

diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
index a1103fc28a2eed..fcfcb5619f7dda 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -20,8 +20,8 @@ define <vscale x 16 x i8> @ld1r_stack() {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    adrp x8, :got:g8
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    adrp x8, :got:g8
 ; CHECK-NEXT:    ldr x8, [x8, :got_lo12:g8]
 ; CHECK-NEXT:    ldrb w8, [x8]
 ; CHECK-NEXT:    strb w8, [sp, #12]
@@ -66,8 +66,8 @@ define <vscale x 16 x i8> @ld1rb_gep(ptr %valp) {
 define <vscale x 16 x i8> @ld1rb_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LABEL: ld1rb_gep_out_of_range_up:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #64
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    add x8, x0, #64
 ; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %valp2 = getelementptr i8, ptr %valp, i32 64
@@ -80,8 +80,8 @@ define <vscale x 16 x i8> @ld1rb_gep_out_of_range_up(ptr %valp) {
 define <vscale x 16 x i8> @ld1rb_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LABEL: ld1rb_gep_out_of_range_down:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #1
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    sub x8, x0, #1
 ; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %valp2 = getelementptr i8, ptr %valp, i32 -1
@@ -197,8 +197,8 @@ define <vscale x 8 x i16> @ld1rh_gep(ptr %valp) {
 define <vscale x 8 x i16> @ld1rh_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LABEL: ld1rh_gep_out_of_range_up:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #128
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    add x8, x0, #128
 ; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %valp2 = getelementptr i16, ptr %valp, i32 64
@@ -211,8 +211,8 @@ define <vscale x 8 x i16> @ld1rh_gep_out_of_range_up(ptr %valp) {
 define <vscale x 8 x i16> @ld1rh_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LABEL: ld1rh_gep_out_of_range_down:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #2
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    sub x8, x0, #2
 ; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %valp2 = getelementptr i16, ptr %valp, i32 -1
@@ -302,8 +302,8 @@ define <vscale x 4 x i32> @ld1rw_gep(ptr %valp) {
 define <vscale x 4 x i32> @ld1rw_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LABEL: ld1rw_gep_out_of_range_up:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #256
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    add x8, x0, #256
 ; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %valp2 = getelementptr i32, ptr %valp, i32 64
@@ -316,8 +316,8 @@ define <vscale x 4 x i32> @ld1rw_gep_out_of_range_up(ptr %valp) {
 define <vscale x 4 x i32> @ld1rw_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LABEL: ld1rw_gep_out_of_range_down:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #4
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sub x8, x0, #4
 ; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %valp2 = getelementptr i32, ptr %valp, i32 -1
@@ -381,8 +381,8 @@ define <vscale x 2 x i64> @ld1rd_gep(ptr %valp) {
 define <vscale x 2 x i64> @ld1rd_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LABEL: ld1rd_gep_out_of_range_up:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #512
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    add x8, x0, #512
 ; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %valp2 = getelementptr i64, ptr %valp, i32 64
@@ -395,8 +395,8 @@ define <vscale x 2 x i64> @ld1rd_gep_out_of_range_up(ptr %valp) {
 define <vscale x 2 x i64> @ld1rd_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LABEL: ld1rd_gep_out_of_range_down:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub x8, x0, #8
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sub x8, x0, #8
 ; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %valp2 = getelementptr i64, ptr %valp, i32 -1
@@ -458,8 +458,8 @@ define <vscale x 8 x half> @ld1rh_half_gep(ptr %valp) {
 define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_up:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    add x8, x0, #128
 ; CHECK-LD1R-NEXT:    ptrue p0.h
+; CHECK-LD1R-NEXT:    add x8, x0, #128
 ; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -478,8 +478,8 @@ define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_up(ptr %valp) {
 define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_down:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    sub x8, x0, #2
 ; CHECK-LD1R-NEXT:    ptrue p0.h
+; CHECK-LD1R-NEXT:    sub x8, x0, #2
 ; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -535,8 +535,8 @@ define <vscale x 4 x half> @ld1rh_half_unpacked4_gep(ptr %valp) {
 define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    add x8, x0, #128
 ; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    add x8, x0, #128
 ; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -555,8 +555,8 @@ define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_up(ptr %valp)
 define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    sub x8, x0, #2
 ; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    sub x8, x0, #2
 ; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -612,8 +612,8 @@ define <vscale x 2 x half> @ld1rh_half_unpacked2_gep(ptr %valp) {
 define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    add x8, x0, #128
 ; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    add x8, x0, #128
 ; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -632,8 +632,8 @@ define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_up(ptr %valp)
 define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    sub x8, x0, #2
 ; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    sub x8, x0, #2
 ; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -689,8 +689,8 @@ define <vscale x 4 x float> @ld1rw_float_gep(ptr %valp) {
 define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_up:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    add x8, x0, #256
 ; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    add x8, x0, #256
 ; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -709,8 +709,8 @@ define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_up(ptr %valp) {
 define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_down:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    sub x8, x0, #4
 ; CHECK-LD1R-NEXT:    ptrue p0.s
+; CHECK-LD1R-NEXT:    sub x8, x0, #4
 ; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -766,8 +766,8 @@ define <vscale x 2 x float> @ld1rw_float_unpacked2_gep(ptr %valp) {
 define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    add x8, x0, #256
 ; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    add x8, x0, #256
 ; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -786,8 +786,8 @@ define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_up(ptr %valp
 define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    sub x8, x0, #4
 ; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    sub x8, x0, #4
 ; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -843,8 +843,8 @@ define <vscale x 2 x double> @ld1rd_double_gep(ptr %valp) {
 define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_up(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_up:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    add x8, x0, #512
 ; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    add x8, x0, #512
 ; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -863,8 +863,8 @@ define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_up(ptr %valp) {
 define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_down(ptr %valp) {
 ; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_down:
 ; CHECK-LD1R:       // %bb.0:
-; CHECK-LD1R-NEXT:    sub x8, x0, #8
 ; CHECK-LD1R-NEXT:    ptrue p0.d
+; CHECK-LD1R-NEXT:    sub x8, x0, #8
 ; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x8]
 ; CHECK-LD1R-NEXT:    ret
 ;
@@ -1250,8 +1250,8 @@ define <vscale x 8 x half> @dup_ld1rh_half_passthruzero_nxv8f16(<vscale x 8 x i1
 ;
 ; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16:
 ; CHECK-NO-LD1R:       // %bb.0:
-; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NO-LD1R-NEXT:    ret
     %ld = load half, ptr %addr
@@ -1266,8 +1266,8 @@ define <vscale x 4 x float> @dup_ld1rs_float_passthruzero_nxv4f32(<vscale x 4 x
 ;
 ; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32:
 ; CHECK-NO-LD1R:       // %bb.0:
-; CHECK-NO-LD1R-NEXT:    ldr s1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    ldr s1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NO-LD1R-NEXT:    ret
     %ld = load float, ptr %addr
@@ -1282,8 +1282,8 @@ define <vscale x 2 x double> @dup_ld1rd_double_passthruzero_nxv2f64(<vscale x 2
 ;
 ; CHECK-NO-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64:
 ; CHECK-NO-LD1R:       // %bb.0:
-; CHECK-NO-LD1R-NEXT:    ldr d1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    ldr d1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.d, p0/m, d1
 ; CHECK-NO-LD1R-NEXT:    ret
     %ld = load double, ptr %addr
@@ -1298,8 +1298,8 @@ define <vscale x 4 x half> @dup_ld1rh_half_passthruzero_nxv4f16(<vscale x 4 x i1
 ;
 ; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16:
 ; CHECK-NO-LD1R:       // %bb.0:
-; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NO-LD1R-NEXT:    ret
     %ld = load half, ptr %addr
@@ -1314,8 +1314,8 @@ define <vscale x 2 x half> @dup_ld1rh_half_passthruzero_nxv2f16(<vscale x 2 x i1
 ;
 ; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16:
 ; CHECK-NO-LD1R:       // %bb.0:
-; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NO-LD1R-NEXT:    ret
     %ld = load half, ptr %addr
@@ -1330,8 +1330,8 @@ define <vscale x 2 x float> @dup_ld1rs_float_passthruzero_nxv2f32(<vscale x 2 x
 ;
 ; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32:
 ; CHECK-NO-LD1R:       // %bb.0:
-; CHECK-NO-LD1R-NEXT:    ldr s1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NO-LD1R-NEXT:    ldr s1, [x0]
 ; CHECK-NO-LD1R-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NO-LD1R-NEXT:    ret
     %ld = load float, ptr %addr
@@ -1415,9 +1415,8 @@ define i8* @avoid_preindex_load(i8* %src, <vscale x 2 x i64>* %out) {
 ; CHECK-LABEL: avoid_preindex_load:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    add x8, x0, #1
 ; CHECK-NEXT:    ld1rsb { z0.d }, p0/z, [x0, #1]
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    add x0, x0, #1
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %ptr = getelementptr inbounds i8, i8* %src, i64 1
@@ -1434,11 +1433,10 @@ define i8* @avoid_preindex_load(i8* %src, <vscale x 2 x i64>* %out) {
 define i8* @avoid_preindex_load_dup(i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
 ; CHECK-LABEL: avoid_preindex_load_dup:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #1
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    ld1rsb { z0.d }, p0/z, [x0, #1]
-; CHECK-NEXT:    mov x0, x8
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    add x0, x0, #1
+; CHECK-NEXT:    st1d { z0.d }, p1, [x1]
 ; CHECK-NEXT:    ret
   %ptr = getelementptr inbounds i8, i8* %src, i64 1
   %tmp = load i8, i8* %ptr, align 4
@@ -1452,11 +1450,10 @@ define i8* @avoid_preindex_load_dup(i8* %src, <vscale x 2 x i1> %pg, <vscale x 2
 define i8* @avoid_preindex_load_dup_passthru_zero(i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
 ; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, #1
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    ld1rsb { z0.d }, p0/z, [x0, #1]
-; CHECK-NEXT:    mov x0, x8
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
+; CHECK-NEXT:    add x0, x0, #1
+; CHECK-NEXT:    st1d { z0.d }, p1, [x1]
 ; CHECK-NEXT:    ret
   %ptr = getelementptr inbounds i8, i8* %src, i64 1
   %tmp = load i8, i8* %ptr, align 4
@@ -1470,8 +1467,8 @@ define i8* @avoid_preindex_load_dup_passthru_zero(i8* %src, <vscale x 2 x i1> %p
 define i8* @preindex_load_dup_passthru(<vscale x 2 x i64> %passthru, i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
 ; CHECK-LABEL: preindex_load_dup_passthru:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsb x8, [x0, #1]!
 ; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ldrsb x8, [x0, #1]!
 ; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    st1d { z0.d }, p1, [x1]
 ; CHECK-NEXT:    ret
@@ -1488,8 +1485,8 @@ define i8* @preindex_load_dup_passthru(<vscale x 2 x i64> %passthru, i8* %src, <
 define i8* @preidx8sext64_instead_of_ld1r(i8* %src, <vscale x 2 x i64>* %out, i64* %dst) {
 ; CHECK-LABEL: preidx8sext64_instead_of_ld1r:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrsb x8, [x0, #1]!
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldrsb x8, [x0, #1]!
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT:    str x8, [x2]

diff --git a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
index 585c5d7a2472e6..06ec132808154a 100644
--- a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
@@ -38,10 +38,10 @@ define void @ld_st_nxv8i16(ptr %in, ptr %out) {
 ;
 ; ASM-LABEL: ld_st_nxv8i16:
 ; ASM:       // %bb.0: // %entry
-; ASM-NEXT:    mov x8, xzr
+; ASM-NEXT:    ptrue p0.h
 ; ASM-NEXT:    mov z0.h, #3 // =0x3
+; ASM-NEXT:    mov x8, xzr
 ; ASM-NEXT:    cnth x9
-; ASM-NEXT:    ptrue p0.h
 ; ASM-NEXT:  .LBB0_1: // %loop
 ; ASM-NEXT:    // =>This Inner Loop Header: Depth=1
 ; ASM-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
@@ -111,9 +111,9 @@ define void @masked_ld_st_nxv8i16(ptr %in, ptr %out, i64 %n) {
 ;
 ; ASM-LABEL: masked_ld_st_nxv8i16:
 ; ASM:       // %bb.0: // %entry
-; ASM-NEXT:    mov x8, xzr
-; ASM-NEXT:    mov z0.h, #3 // =0x3
 ; ASM-NEXT:    ptrue p0.h
+; ASM-NEXT:    mov z0.h, #3 // =0x3
+; ASM-NEXT:    mov x8, xzr
 ; ASM-NEXT:    cnth x9
 ; ASM-NEXT:  .LBB1_1: // %loop
 ; ASM-NEXT:    // =>This Inner Loop Header: Depth=1

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
index 767789866a0bfb..0d06c21abfd2ee 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -113,9 +113,9 @@ define <vscale x 8 x half> @masked_gather_nxv8f16(<vscale x 8 x ptr> %ptrs, <vsc
 ; CHECK-NEXT:    ld1h { z2.d }, p1/z, [z2.d]
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
 ; CHECK-NEXT:    ld1h { z1.d }, p1/z, [z1.d]
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
 ; CHECK-NEXT:    ret
@@ -126,8 +126,8 @@ define <vscale x 8 x half> @masked_gather_nxv8f16(<vscale x 8 x ptr> %ptrs, <vsc
 define <vscale x 8 x bfloat> @masked_gather_nxv8bf16(ptr %base, <vscale x 8 x i16> %indices, <vscale x 8 x i1> %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv8bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sunpkhi z1.s, z0.h
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    sunpkhi z1.s, z0.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    ld1h { z1.s }, p1/z, [x0, z1.s, sxtw #1]
@@ -175,16 +175,16 @@ define <vscale x 8 x float> @masked_gather_nxv8f32(ptr %base, <vscale x 8 x i32>
 define <vscale x 16 x i8> @masked_gather_nxv16i8(ptr %base, <vscale x 16 x i8> %indices, <vscale x 16 x i1> %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sunpkhi z1.h, z0.b
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    sunpkhi z1.h, z0.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    sunpkhi z2.s, z1.h
-; CHECK-NEXT:    punpkhi p2.h, p1.b
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    punpkhi p2.h, p1.b
 ; CHECK-NEXT:    punpklo p1.h, p1.b
 ; CHECK-NEXT:    ld1b { z2.s }, p2/z, [x0, z2.s, sxtw]
 ; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x0, z1.s, sxtw]
-; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpkhi p1.h, p0.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
index 85afb5f8e61f73..476401c7ebd805 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
@@ -68,8 +68,8 @@ define <vscale x 8 x i16> @masked_sload_nxv8i8(<vscale x 8 x i8> *%a, <vscale x
 define <vscale x 2 x i64> @masked_sload_passthru(<vscale x 2 x i32> *%a, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru) {
 ; CHECK-LABEL: masked_sload_passthru:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x0]
 ; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x0]
 ; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll
index aa26352e998bbf..c4447f2f26ae04 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll
@@ -98,9 +98,9 @@ define <vscale x 8 x i64> @masked_zload_nxv8i16(<vscale x 8 x i16>* %a, <vscale
 define <vscale x 2 x double> @masked_zload_2i16_2f64(<vscale x 2 x i16>* noalias %in, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_zload_2i16_2f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    ucvtf z0.d, p1/m, z0.d
 ; CHECK-NEXT:    ret
   %wide.load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %in, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
   %zext = zext <vscale x 2 x i16> %wide.load to <vscale x 2 x i32>

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
index 9216381942e87a..10a6445c6fa2f1 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
@@ -9,25 +9,25 @@ target triple = "aarch64-linux-gnu"
 define void @masked_scatter_nxv16i8(<vscale x 16 x i8> %data, ptr %base, <vscale x 16 x i8> %offsets, <vscale x 16 x i1> %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEXT:    uunpklo z4.h, z0.b
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-NEXT:    uunpkhi z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEXT:    punpklo p2.h, p1.b
 ; CHECK-NEXT:    uunpklo z5.s, z4.h
-; CHECK-NEXT:    st1b { z5.s }, p2, [x0, z3.s, sxtw]
 ; CHECK-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEXT:    punpklo p2.h, p1.b
 ; CHECK-NEXT:    punpkhi p1.h, p1.b
+; CHECK-NEXT:    st1b { z5.s }, p2, [x0, z3.s, sxtw]
 ; CHECK-NEXT:    uunpkhi z3.s, z4.h
-; CHECK-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEXT:    punpkhi p0.h, p0.b
-; CHECK-NEXT:    uunpkhi z0.h, z0.b
 ; CHECK-NEXT:    st1b { z3.s }, p1, [x0, z2.s, sxtw]
-; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    sunpklo z2.s, z1.h
+; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    st1b { z3.s }, p1, [x0, z2.s, sxtw]
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x0, z1.s, sxtw]
@@ -40,11 +40,11 @@ define void @masked_scatter_nxv16i8(<vscale x 16 x i8> %data, ptr %base, <vscale
 define void @masked_scatter_nxv8i16(<vscale x 8 x i16> %data, ptr %base, <vscale x 8 x i16> %offsets, <vscale x 8 x i1> %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    st1h { z3.s }, p1, [x0, z2.s, sxtw #1]
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
@@ -57,11 +57,11 @@ define void @masked_scatter_nxv8i16(<vscale x 8 x i16> %data, ptr %base, <vscale
 define void @masked_scatter_nxv8bf16(<vscale x 8 x bfloat> %data, ptr %base, <vscale x 8 x i16> %offsets, <vscale x 8 x i1> %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv8bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEXT:    uunpkhi z0.s, z0.h
 ; CHECK-NEXT:    st1h { z3.s }, p1, [x0, z2.s, sxtw #1]
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0, z1.s, sxtw #1]

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
index 7518ee66b6f6c4..5a7287bb61885f 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
@@ -76,8 +76,8 @@ define void @masked_scatter_nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x d
 define void @masked_scatter_splat_constant_pointer (<vscale x 4 x i1> %pg) {
 ; CHECK-LABEL: masked_scatter_splat_constant_pointer:
 ; CHECK:       // %bb.0: // %vector.body
-; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    punpklo p1.h, p0.b
+; CHECK-NEXT:    mov z0.d, #0 // =0x0
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    st1w { z0.d }, p1, [z0.d]
 ; CHECK-NEXT:    st1w { z0.d }, p0, [z0.d]

diff --git a/llvm/test/CodeGen/AArch64/sve-pr62151.ll b/llvm/test/CodeGen/AArch64/sve-pr62151.ll
index 5ed34f14a0b140..7cec20fda429c1 100644
--- a/llvm/test/CodeGen/AArch64/sve-pr62151.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pr62151.ll
@@ -5,8 +5,8 @@
 define i32 @build_interpolation(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) {
 ; CHECK-LABEL: build_interpolation:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mul v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    mul v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT:    mla v0.2s, v1.2s, v0.s[1]

diff --git a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll
index d53dba17dd9695..4d46ac5ecbaa95 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll
@@ -54,23 +54,23 @@ define aarch64_sve_vector_pcs <vscale x 64 x i1> @add_nxv64i1(<vscale x 64 x i1>
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue p4.b
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    ptrue p8.b
-; CHECK-NEXT:    ldr p4, [x1]
 ; CHECK-NEXT:    ldr p5, [x0]
-; CHECK-NEXT:    ldr p6, [x3]
+; CHECK-NEXT:    ldr p6, [x1]
 ; CHECK-NEXT:    ldr p7, [x2]
-; CHECK-NEXT:    eor p0.b, p8/z, p0.b, p5.b
-; CHECK-NEXT:    eor p1.b, p8/z, p1.b, p4.b
-; CHECK-NEXT:    eor p2.b, p8/z, p2.b, p7.b
-; CHECK-NEXT:    eor p3.b, p8/z, p3.b, p6.b
-; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [x3]
+; CHECK-NEXT:    eor p0.b, p4/z, p0.b, p5.b
+; CHECK-NEXT:    eor p1.b, p4/z, p1.b, p6.b
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    eor p2.b, p4/z, p2.b, p7.b
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    eor p3.b, p4/z, p3.b, p8.b
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -138,23 +138,23 @@ define aarch64_sve_vector_pcs <vscale x 64 x i1> @sub_nxv64i1(<vscale x 64 x i1>
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    ptrue p4.b
 ; CHECK-NEXT:    str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    ptrue p8.b
-; CHECK-NEXT:    ldr p4, [x1]
 ; CHECK-NEXT:    ldr p5, [x0]
-; CHECK-NEXT:    ldr p6, [x3]
+; CHECK-NEXT:    ldr p6, [x1]
 ; CHECK-NEXT:    ldr p7, [x2]
-; CHECK-NEXT:    eor p0.b, p8/z, p0.b, p5.b
-; CHECK-NEXT:    eor p1.b, p8/z, p1.b, p4.b
-; CHECK-NEXT:    eor p2.b, p8/z, p2.b, p7.b
-; CHECK-NEXT:    eor p3.b, p8/z, p3.b, p6.b
-; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [x3]
+; CHECK-NEXT:    eor p0.b, p4/z, p0.b, p5.b
+; CHECK-NEXT:    eor p1.b, p4/z, p1.b, p6.b
 ; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    eor p2.b, p4/z, p2.b, p7.b
+; CHECK-NEXT:    ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    eor p3.b, p4/z, p3.b, p8.b
+; CHECK-NEXT:    ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1

diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
index a8b28c756f20b5..539f443de18a1f 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
@@ -322,8 +322,8 @@ entry:
 define <vscale x 4 x i32> @ornot_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
 ; CHECK-LABEL: ornot_v4i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z3.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z3.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z2.d, z2.d, z3.d
 ; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    orr z1.d, z1.d, z2.d
@@ -340,8 +340,8 @@ entry:
 define <vscale x 8 x i16> @ornot_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
 ; CHECK-LABEL: ornot_v8i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z3.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov z3.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z2.d, z2.d, z3.d
 ; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    orr z1.d, z1.d, z2.d
@@ -358,8 +358,8 @@ entry:
 define <vscale x 16 x i8> @ornot_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
 ; CHECK-LABEL: ornot_v16i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z3.b, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov z3.b, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    eor z2.d, z2.d, z3.d
 ; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    orr z1.d, z1.d, z2.d
@@ -467,9 +467,9 @@ define <vscale x 4 x i32> @icmp_slt_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i
 ; CHECK-LABEL: icmp_slt_v4i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    smin z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 4 x i32> %z, zeroinitializer
@@ -483,9 +483,9 @@ define <vscale x 8 x i16> @icmp_slt_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i
 ; CHECK-LABEL: icmp_slt_v8i16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    smin z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 8 x i16> %z, zeroinitializer
@@ -499,9 +499,9 @@ define <vscale x 16 x i8> @icmp_slt_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x
 ; CHECK-LABEL: icmp_slt_v16i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    smin z1.b, p0/m, z1.b, z2.b
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    mov z0.b, p0/m, z1.b
+; CHECK-NEXT:    mov z0.b, p1/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 16 x i8> %z, zeroinitializer
@@ -515,9 +515,9 @@ define <vscale x 4 x i32> @icmp_sgt_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i
 ; CHECK-LABEL: icmp_sgt_v4i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    smax z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 4 x i32> %z, zeroinitializer
@@ -531,9 +531,9 @@ define <vscale x 8 x i16> @icmp_sgt_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i
 ; CHECK-LABEL: icmp_sgt_v8i16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    smax z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 8 x i16> %z, zeroinitializer
@@ -547,9 +547,9 @@ define <vscale x 16 x i8> @icmp_sgt_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x
 ; CHECK-LABEL: icmp_sgt_v16i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    smax z1.b, p0/m, z1.b, z2.b
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    mov z0.b, p0/m, z1.b
+; CHECK-NEXT:    mov z0.b, p1/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 16 x i8> %z, zeroinitializer
@@ -563,9 +563,9 @@ define <vscale x 4 x i32> @icmp_ult_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i
 ; CHECK-LABEL: icmp_ult_v4i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    umin z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 4 x i32> %z, zeroinitializer
@@ -579,9 +579,9 @@ define <vscale x 8 x i16> @icmp_ult_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i
 ; CHECK-LABEL: icmp_ult_v8i16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    umin z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 8 x i16> %z, zeroinitializer
@@ -595,9 +595,9 @@ define <vscale x 16 x i8> @icmp_ult_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x
 ; CHECK-LABEL: icmp_ult_v16i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    umin z1.b, p0/m, z1.b, z2.b
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    mov z0.b, p0/m, z1.b
+; CHECK-NEXT:    mov z0.b, p1/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 16 x i8> %z, zeroinitializer
@@ -611,9 +611,9 @@ define <vscale x 4 x i32> @icmp_ugt_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i
 ; CHECK-LABEL: icmp_ugt_v4i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    umax z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 4 x i32> %z, zeroinitializer
@@ -627,9 +627,9 @@ define <vscale x 8 x i16> @icmp_ugt_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i
 ; CHECK-LABEL: icmp_ugt_v8i16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    umax z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 8 x i16> %z, zeroinitializer
@@ -643,9 +643,9 @@ define <vscale x 16 x i8> @icmp_ugt_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x
 ; CHECK-LABEL: icmp_ugt_v16i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    umax z1.b, p0/m, z1.b, z2.b
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    mov z0.b, p0/m, z1.b
+; CHECK-NEXT:    mov z0.b, p1/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp eq <vscale x 16 x i8> %z, zeroinitializer
@@ -659,9 +659,9 @@ define <vscale x 4 x float> @fcmp_fast_olt_v4f32(<vscale x 4 x float> %z, <vscal
 ; CHECK-LABEL: fcmp_fast_olt_v4f32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp oeq <vscale x 4 x float> %z, zeroinitializer
@@ -675,9 +675,9 @@ define <vscale x 8 x half> @fcmp_fast_olt_v8f16(<vscale x 8 x half> %z, <vscale
 ; CHECK-LABEL: fcmp_fast_olt_v8f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
 ; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp oeq <vscale x 8 x half> %z, zeroinitializer
@@ -691,9 +691,9 @@ define <vscale x 4 x float> @fcmp_fast_ogt_v4f32(<vscale x 4 x float> %z, <vscal
 ; CHECK-LABEL: fcmp_fast_ogt_v4f32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    fmaxnm z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    mov z0.s, p1/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp oeq <vscale x 4 x float> %z, zeroinitializer
@@ -707,9 +707,9 @@ define <vscale x 8 x half> @fcmp_fast_ogt_v8f16(<vscale x 8 x half> %z, <vscale
 ; CHECK-LABEL: fcmp_fast_ogt_v8f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, #0.0
 ; CHECK-NEXT:    fmaxnm z1.h, p0/m, z1.h, z2.h
-; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    mov z0.h, p1/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp oeq <vscale x 8 x half> %z, zeroinitializer
@@ -904,8 +904,8 @@ define <vscale x 4 x i32> @addqr_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i32>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    add z1.s, z1.s, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -922,8 +922,8 @@ define <vscale x 8 x i16> @addqr_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i16>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov z2.h, w0
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    add z1.h, z1.h, z2.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -940,8 +940,8 @@ define <vscale x 16 x i8> @addqr_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x i8>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    mov z2.b, w0
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    add z1.b, z1.b, z2.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
@@ -958,8 +958,8 @@ define <vscale x 4 x i32> @subqr_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i32>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    sub z1.s, z1.s, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -976,8 +976,8 @@ define <vscale x 8 x i16> @subqr_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i16>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov z2.h, w0
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    sub z1.h, z1.h, z2.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -994,8 +994,8 @@ define <vscale x 16 x i8> @subqr_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x i8>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    mov z2.b, w0
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    sub z1.b, z1.b, z2.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
@@ -1012,8 +1012,8 @@ define <vscale x 4 x i32> @mulqr_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x i32>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    mul z1.s, z1.s, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -1030,8 +1030,8 @@ define <vscale x 8 x i16> @mulqr_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x i16>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov z2.h, w0
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mul z1.h, z1.h, z2.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -1048,8 +1048,8 @@ define <vscale x 16 x i8> @mulqr_v16i8(<vscale x 16 x i8> %z, <vscale x 16 x i8>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    mov z2.b, w0
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mul z1.b, z1.b, z2.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
@@ -1064,11 +1064,11 @@ entry:
 define <vscale x 4 x float> @faddqr_v4f32(<vscale x 4 x float> %z, <vscale x 4 x float> %x, float %y) {
 ; CHECK-LABEL: faddqr_v4f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $s2 killed $s2 def $z2
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $z2
 ; CHECK-NEXT:    mov z2.s, s2
-; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    fadd z1.s, z1.s, z2.s
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -1083,11 +1083,11 @@ entry:
 define <vscale x 8 x half> @faddqr_v8f16(<vscale x 8 x half> %z, <vscale x 8 x half> %x, half %y) {
 ; CHECK-LABEL: faddqr_v8f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
 ; CHECK-NEXT:    mov z2.h, h2
-; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; CHECK-NEXT:    fadd z1.h, z1.h, z2.h
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -1102,11 +1102,11 @@ entry:
 define <vscale x 4 x float> @fsubqr_v4f32(<vscale x 4 x float> %z, <vscale x 4 x float> %x, float %y) {
 ; CHECK-LABEL: fsubqr_v4f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $s2 killed $s2 def $z2
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $z2
 ; CHECK-NEXT:    mov z2.s, s2
-; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    fsub z1.s, z1.s, z2.s
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -1121,11 +1121,11 @@ entry:
 define <vscale x 8 x half> @fsubqr_v8f16(<vscale x 8 x half> %z, <vscale x 8 x half> %x, half %y) {
 ; CHECK-LABEL: fsubqr_v8f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
 ; CHECK-NEXT:    mov z2.h, h2
-; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; CHECK-NEXT:    fsub z1.h, z1.h, z2.h
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -1140,11 +1140,11 @@ entry:
 define <vscale x 4 x float> @fmulqr_v4f32(<vscale x 4 x float> %z, <vscale x 4 x float> %x, float %y) {
 ; CHECK-LABEL: fmulqr_v4f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $s2 killed $s2 def $z2
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $z2
 ; CHECK-NEXT:    mov z2.s, s2
-; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    fmul z1.s, z1.s, z2.s
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -1159,11 +1159,11 @@ entry:
 define <vscale x 8 x half> @fmulqr_v8f16(<vscale x 8 x half> %z, <vscale x 8 x half> %x, half %y) {
 ; CHECK-LABEL: fmulqr_v8f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    // kill: def $h2 killed $h2 def $z2
 ; CHECK-NEXT:    mov z2.h, h2
-; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; CHECK-NEXT:    fmul z1.h, z1.h, z2.h
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -1180,8 +1180,8 @@ define <vscale x 4 x i32> @sadd_satqr_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    sqadd z1.s, z1.s, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -1198,8 +1198,8 @@ define <vscale x 8 x i16> @sadd_satqr_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov z2.h, w0
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    sqadd z1.h, z1.h, z2.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -1216,8 +1216,8 @@ define <vscale x 16 x i8> @sadd_satqr_v16i8(<vscale x 16 x i8> %z, <vscale x 16
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    mov z2.b, w0
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    sqadd z1.b, z1.b, z2.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
@@ -1234,8 +1234,8 @@ define <vscale x 4 x i32> @uadd_satqr_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    uqadd z1.s, z1.s, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -1252,8 +1252,8 @@ define <vscale x 8 x i16> @uadd_satqr_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov z2.h, w0
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    uqadd z1.h, z1.h, z2.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -1270,8 +1270,8 @@ define <vscale x 16 x i8> @uadd_satqr_v16i8(<vscale x 16 x i8> %z, <vscale x 16
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    mov z2.b, w0
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    uqadd z1.b, z1.b, z2.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
@@ -1288,8 +1288,8 @@ define <vscale x 4 x i32> @ssub_satqr_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    sqsub z1.s, z1.s, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -1306,8 +1306,8 @@ define <vscale x 8 x i16> @ssub_satqr_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov z2.h, w0
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    sqsub z1.h, z1.h, z2.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -1324,8 +1324,8 @@ define <vscale x 16 x i8> @ssub_satqr_v16i8(<vscale x 16 x i8> %z, <vscale x 16
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    mov z2.b, w0
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    sqsub z1.b, z1.b, z2.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
 entry:
@@ -1342,8 +1342,8 @@ define <vscale x 4 x i32> @usub_satqr_v4i32(<vscale x 4 x i32> %z, <vscale x 4 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mov z2.s, w0
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    uqsub z1.s, z1.s, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
@@ -1360,8 +1360,8 @@ define <vscale x 8 x i16> @usub_satqr_v8i16(<vscale x 8 x i16> %z, <vscale x 8 x
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    mov z2.h, w0
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    uqsub z1.h, z1.h, z2.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
@@ -1378,8 +1378,8 @@ define <vscale x 16 x i8> @usub_satqr_v16i8(<vscale x 16 x i8> %z, <vscale x 16
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    mov z2.b, w0
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    uqsub z1.b, z1.b, z2.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll
index 64788d349cf36f..14bc1b45e79ee3 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll
@@ -173,9 +173,9 @@ define <vscale x 2 x i64> @sdiv_nxv2i64_x(<vscale x 2 x i64> %x, <vscale x 2 x i
 ; CHECK-LABEL: sdiv_nxv2i64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cmpgt p1.d, p0/z, z2.d, #0
 ; CHECK-NEXT:    sdivr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, p1/m, z1.d
+; CHECK-NEXT:    cmpgt p0.d, p0/z, z2.d, #0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
@@ -188,9 +188,9 @@ define <vscale x 4 x i32> @sdiv_nxv4i32_x(<vscale x 4 x i32> %x, <vscale x 4 x i
 ; CHECK-LABEL: sdiv_nxv4i32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cmpgt p1.s, p0/z, z2.s, #0
 ; CHECK-NEXT:    sdivr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.s, p1/m, z1.s
+; CHECK-NEXT:    cmpgt p0.s, p0/z, z2.s, #0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
@@ -202,14 +202,14 @@ entry:
 define <vscale x 8 x i16> @sdiv_nxv8i16_x(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y, <vscale x 8 x i16> %n) {
 ; CHECK-LABEL: sdiv_nxv8i16_x:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    sunpkhi z3.s, z1.h
 ; CHECK-NEXT:    sunpkhi z4.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z5.s, z0.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    sunpklo z4.s, z0.h
+; CHECK-NEXT:    sdivr z1.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    sdivr z1.s, p1/m, z1.s, z5.s
 ; CHECK-NEXT:    cmpgt p0.h, p0/z, z2.h, #0
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z3.h
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
@@ -227,25 +227,24 @@ define <vscale x 16 x i8> @sdiv_nxv16i8_x(<vscale x 16 x i8> %x, <vscale x 16 x
 ; CHECK-NEXT:    sunpkhi z3.h, z1.b
 ; CHECK-NEXT:    sunpkhi z4.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEXT:    sunpkhi z5.s, z3.h
 ; CHECK-NEXT:    sunpkhi z6.s, z4.h
-; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    sunpklo z6.h, z0.b
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpkhi z7.s, z1.h
-; CHECK-NEXT:    sunpkhi z24.s, z6.h
+; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    sunpkhi z6.s, z1.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    sdivr z1.s, p0/m, z1.s, z6.s
-; CHECK-NEXT:    movprfx z4, z24
-; CHECK-NEXT:    sdiv z4.s, p0/m, z4.s, z7.s
-; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    sunpklo z4.h, z0.b
+; CHECK-NEXT:    sunpkhi z7.s, z4.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z5.h
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z4.h
+; CHECK-NEXT:    sdivr z1.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpgt p0.b, p0/z, z2.b, #0
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z6.h
 ; CHECK-NEXT:    uzp1 z1.b, z1.b, z3.b
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
@@ -260,9 +259,9 @@ define <vscale x 2 x i64> @udiv_nxv2i64_x(<vscale x 2 x i64> %x, <vscale x 2 x i
 ; CHECK-LABEL: udiv_nxv2i64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cmpgt p1.d, p0/z, z2.d, #0
 ; CHECK-NEXT:    udivr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, p1/m, z1.d
+; CHECK-NEXT:    cmpgt p0.d, p0/z, z2.d, #0
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
@@ -275,9 +274,9 @@ define <vscale x 4 x i32> @udiv_nxv4i32_x(<vscale x 4 x i32> %x, <vscale x 4 x i
 ; CHECK-LABEL: udiv_nxv4i32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cmpgt p1.s, p0/z, z2.s, #0
 ; CHECK-NEXT:    udivr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    mov z0.s, p1/m, z1.s
+; CHECK-NEXT:    cmpgt p0.s, p0/z, z2.s, #0
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
@@ -289,14 +288,14 @@ entry:
 define <vscale x 8 x i16> @udiv_nxv8i16_x(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y, <vscale x 8 x i16> %n) {
 ; CHECK-LABEL: udiv_nxv8i16_x:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpkhi z3.s, z1.h
 ; CHECK-NEXT:    uunpkhi z4.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uunpklo z4.s, z0.h
+; CHECK-NEXT:    udivr z1.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    udivr z1.s, p1/m, z1.s, z5.s
 ; CHECK-NEXT:    cmpgt p0.h, p0/z, z2.h, #0
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z3.h
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
@@ -314,25 +313,24 @@ define <vscale x 16 x i8> @udiv_nxv16i8_x(<vscale x 16 x i8> %x, <vscale x 16 x
 ; CHECK-NEXT:    uunpkhi z3.h, z1.b
 ; CHECK-NEXT:    uunpkhi z4.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    uunpkhi z5.s, z3.h
 ; CHECK-NEXT:    uunpkhi z6.s, z4.h
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    uunpklo z6.h, z0.b
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpkhi z7.s, z1.h
-; CHECK-NEXT:    uunpkhi z24.s, z6.h
+; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    uunpkhi z6.s, z1.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    udivr z1.s, p0/m, z1.s, z6.s
-; CHECK-NEXT:    movprfx z4, z24
-; CHECK-NEXT:    udiv z4.s, p0/m, z4.s, z7.s
-; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uunpklo z4.h, z0.b
+; CHECK-NEXT:    uunpkhi z7.s, z4.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z5.h
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z4.h
+; CHECK-NEXT:    udivr z1.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpgt p0.b, p0/z, z2.b, #0
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z6.h
 ; CHECK-NEXT:    uzp1 z1.b, z1.b, z3.b
 ; CHECK-NEXT:    mov z0.b, p0/m, z1.b
 ; CHECK-NEXT:    ret
@@ -347,10 +345,10 @@ define <vscale x 2 x i64> @srem_nxv2i64_x(<vscale x 2 x i64> %x, <vscale x 2 x i
 ; CHECK-LABEL: srem_nxv2i64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cmpgt p1.d, p0/z, z2.d, #0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT:    mls z0.d, p1/m, z2.d, z1.d
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    sdiv z3.d, p0/m, z3.d, z1.d
+; CHECK-NEXT:    cmpgt p0.d, p0/z, z2.d, #0
+; CHECK-NEXT:    mls z0.d, p0/m, z3.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
@@ -363,10 +361,10 @@ define <vscale x 4 x i32> @srem_nxv4i32_x(<vscale x 4 x i32> %x, <vscale x 4 x i
 ; CHECK-LABEL: srem_nxv4i32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cmpgt p1.s, p0/z, z2.s, #0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
-; CHECK-NEXT:    mls z0.s, p1/m, z2.s, z1.s
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z1.s
+; CHECK-NEXT:    cmpgt p0.s, p0/z, z2.s, #0
+; CHECK-NEXT:    mls z0.s, p0/m, z3.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
@@ -378,18 +376,17 @@ entry:
 define <vscale x 8 x i16> @srem_nxv8i16_x(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y, <vscale x 8 x i16> %n) {
 ; CHECK-LABEL: srem_nxv8i16_x:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    sunpkhi z3.s, z1.h
 ; CHECK-NEXT:    sunpkhi z4.s, z0.h
+; CHECK-NEXT:    sunpklo z5.s, z0.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    sunpklo z4.s, z1.h
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    sunpklo z5.s, z1.h
-; CHECK-NEXT:    sunpklo z6.s, z0.h
-; CHECK-NEXT:    movprfx z4, z6
-; CHECK-NEXT:    sdiv z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    cmpgt p0.h, p0/z, z2.h, #0
-; CHECK-NEXT:    uzp1 z2.h, z4.h, z3.h
-; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 8 x i16> %n, zeroinitializer
@@ -406,25 +403,24 @@ define <vscale x 16 x i8> @srem_nxv16i8_x(<vscale x 16 x i8> %x, <vscale x 16 x
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    sunpkhi z5.s, z3.h
 ; CHECK-NEXT:    sunpkhi z6.s, z4.h
-; CHECK-NEXT:    sunpklo z7.h, z1.b
-; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    sunpklo z6.h, z0.b
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpkhi z24.s, z7.h
-; CHECK-NEXT:    sunpkhi z25.s, z6.h
-; CHECK-NEXT:    sunpklo z7.s, z7.h
+; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    sunpklo z6.h, z0.b
+; CHECK-NEXT:    sunpkhi z24.s, z6.h
 ; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    movprfx z4, z25
-; CHECK-NEXT:    sdiv z4.s, p0/m, z4.s, z24.s
-; CHECK-NEXT:    sdiv z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    sunpklo z4.h, z1.b
+; CHECK-NEXT:    sunpkhi z7.s, z4.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z24.s
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z5.h
-; CHECK-NEXT:    uzp1 z4.h, z6.h, z4.h
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpgt p0.b, p0/z, z2.b, #0
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z3.b
-; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z7.h
+; CHECK-NEXT:    uzp1 z3.b, z4.b, z3.b
+; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 16 x i8> %n, zeroinitializer
@@ -437,10 +433,10 @@ define <vscale x 2 x i64> @urem_nxv2i64_x(<vscale x 2 x i64> %x, <vscale x 2 x i
 ; CHECK-LABEL: urem_nxv2i64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cmpgt p1.d, p0/z, z2.d, #0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT:    mls z0.d, p1/m, z2.d, z1.d
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    udiv z3.d, p0/m, z3.d, z1.d
+; CHECK-NEXT:    cmpgt p0.d, p0/z, z2.d, #0
+; CHECK-NEXT:    mls z0.d, p0/m, z3.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
@@ -453,10 +449,10 @@ define <vscale x 4 x i32> @urem_nxv4i32_x(<vscale x 4 x i32> %x, <vscale x 4 x i
 ; CHECK-LABEL: urem_nxv4i32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cmpgt p1.s, p0/z, z2.s, #0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
-; CHECK-NEXT:    mls z0.s, p1/m, z2.s, z1.s
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z1.s
+; CHECK-NEXT:    cmpgt p0.s, p0/z, z2.s, #0
+; CHECK-NEXT:    mls z0.s, p0/m, z3.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
@@ -468,18 +464,17 @@ entry:
 define <vscale x 8 x i16> @urem_nxv8i16_x(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y, <vscale x 8 x i16> %n) {
 ; CHECK-LABEL: urem_nxv8i16_x:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpkhi z3.s, z1.h
 ; CHECK-NEXT:    uunpkhi z4.s, z0.h
+; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    uunpklo z5.s, z1.h
-; CHECK-NEXT:    uunpklo z6.s, z0.h
-; CHECK-NEXT:    movprfx z4, z6
-; CHECK-NEXT:    udiv z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    cmpgt p0.h, p0/z, z2.h, #0
-; CHECK-NEXT:    uzp1 z2.h, z4.h, z3.h
-; CHECK-NEXT:    mls z0.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 8 x i16> %n, zeroinitializer
@@ -496,25 +491,24 @@ define <vscale x 16 x i8> @urem_nxv16i8_x(<vscale x 16 x i8> %x, <vscale x 16 x
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpkhi z5.s, z3.h
 ; CHECK-NEXT:    uunpkhi z6.s, z4.h
-; CHECK-NEXT:    uunpklo z7.h, z1.b
-; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    uunpklo z6.h, z0.b
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpkhi z24.s, z7.h
-; CHECK-NEXT:    uunpkhi z25.s, z6.h
-; CHECK-NEXT:    uunpklo z7.s, z7.h
+; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    uunpklo z6.h, z0.b
+; CHECK-NEXT:    uunpkhi z24.s, z6.h
 ; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    movprfx z4, z25
-; CHECK-NEXT:    udiv z4.s, p0/m, z4.s, z24.s
-; CHECK-NEXT:    udiv z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uunpklo z4.h, z1.b
+; CHECK-NEXT:    uunpkhi z7.s, z4.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z24.s
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z5.h
-; CHECK-NEXT:    uzp1 z4.h, z6.h, z4.h
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpgt p0.b, p0/z, z2.b, #0
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z3.b
-; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z7.h
+; CHECK-NEXT:    uzp1 z3.b, z4.b, z3.b
+; CHECK-NEXT:    mls z0.b, p0/m, z3.b, z1.b
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 16 x i8> %n, zeroinitializer
@@ -1130,10 +1124,10 @@ define <vscale x 4 x float> @fdiv_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fdiv_nxv4f32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
 ; CHECK-NEXT:    fdivr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.s, p1/m, z1.s
+; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1148,8 +1142,8 @@ define <vscale x 8 x half> @fdiv_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
 ; CHECK-NEXT:    fdivr z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.h, p1/m, z1.h
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1162,10 +1156,10 @@ define <vscale x 2 x double> @fdiv_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fdiv_nxv2f64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
 ; CHECK-NEXT:    fdivr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.d, p1/m, z1.d
+; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1631,9 +1625,9 @@ define <vscale x 2 x i64> @sdiv_nxv2i64_y(<vscale x 2 x i64> %x, <vscale x 2 x i
 ; CHECK-LABEL: sdiv_nxv2i64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cmpgt p1.d, p0/z, z2.d, #0
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    cmpgt p0.d, p0/z, z2.d, #0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
@@ -1646,9 +1640,9 @@ define <vscale x 4 x i32> @sdiv_nxv4i32_y(<vscale x 4 x i32> %x, <vscale x 4 x i
 ; CHECK-LABEL: sdiv_nxv4i32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cmpgt p1.s, p0/z, z2.s, #0
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    cmpgt p0.s, p0/z, z2.s, #0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
@@ -1660,14 +1654,14 @@ entry:
 define <vscale x 8 x i16> @sdiv_nxv8i16_y(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y, <vscale x 8 x i16> %n) {
 ; CHECK-LABEL: sdiv_nxv8i16_y:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    sunpkhi z3.s, z1.h
 ; CHECK-NEXT:    sunpkhi z4.s, z0.h
-; CHECK-NEXT:    sunpklo z5.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    sunpklo z4.s, z1.h
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    sdiv z0.s, p1/m, z0.s, z5.s
 ; CHECK-NEXT:    cmpgt p0.h, p0/z, z2.h, #0
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
@@ -1685,25 +1679,24 @@ define <vscale x 16 x i8> @sdiv_nxv16i8_y(<vscale x 16 x i8> %x, <vscale x 16 x
 ; CHECK-NEXT:    sunpkhi z3.h, z1.b
 ; CHECK-NEXT:    sunpkhi z4.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpkhi z5.s, z3.h
 ; CHECK-NEXT:    sunpkhi z6.s, z4.h
-; CHECK-NEXT:    sunpklo z7.h, z1.b
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sunpkhi z7.s, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpkhi z6.s, z7.h
-; CHECK-NEXT:    sunpkhi z24.s, z0.h
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    movprfx z4, z24
-; CHECK-NEXT:    sdiv z4.s, p0/m, z4.s, z6.s
-; CHECK-NEXT:    sunpklo z6.s, z7.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z6.s
-; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    sunpklo z4.h, z1.b
+; CHECK-NEXT:    sunpkhi z6.s, z4.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z5.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z4.h
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z4.s
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpgt p0.b, p0/z, z2.b, #0
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z6.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z3.b
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
@@ -1718,9 +1711,9 @@ define <vscale x 2 x i64> @udiv_nxv2i64_y(<vscale x 2 x i64> %x, <vscale x 2 x i
 ; CHECK-LABEL: udiv_nxv2i64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cmpgt p1.d, p0/z, z2.d, #0
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    cmpgt p0.d, p0/z, z2.d, #0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 2 x i64> %n, zeroinitializer
@@ -1733,9 +1726,9 @@ define <vscale x 4 x i32> @udiv_nxv4i32_y(<vscale x 4 x i32> %x, <vscale x 4 x i
 ; CHECK-LABEL: udiv_nxv4i32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cmpgt p1.s, p0/z, z2.s, #0
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    cmpgt p0.s, p0/z, z2.s, #0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = icmp sgt <vscale x 4 x i32> %n, zeroinitializer
@@ -1747,14 +1740,14 @@ entry:
 define <vscale x 8 x i16> @udiv_nxv8i16_y(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y, <vscale x 8 x i16> %n) {
 ; CHECK-LABEL: udiv_nxv8i16_y:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpkhi z3.s, z1.h
 ; CHECK-NEXT:    uunpkhi z4.s, z0.h
-; CHECK-NEXT:    uunpklo z5.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z4.s
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    udiv z0.s, p1/m, z0.s, z5.s
 ; CHECK-NEXT:    cmpgt p0.h, p0/z, z2.h, #0
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
@@ -1772,25 +1765,24 @@ define <vscale x 16 x i8> @udiv_nxv16i8_y(<vscale x 16 x i8> %x, <vscale x 16 x
 ; CHECK-NEXT:    uunpkhi z3.h, z1.b
 ; CHECK-NEXT:    uunpkhi z4.h, z0.b
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpkhi z5.s, z3.h
 ; CHECK-NEXT:    uunpkhi z6.s, z4.h
-; CHECK-NEXT:    uunpklo z7.h, z1.b
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    uunpkhi z7.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpkhi z6.s, z7.h
-; CHECK-NEXT:    uunpkhi z24.s, z0.h
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    movprfx z4, z24
-; CHECK-NEXT:    udiv z4.s, p0/m, z4.s, z6.s
-; CHECK-NEXT:    uunpklo z6.s, z7.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z6.s
-; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uunpklo z4.h, z1.b
+; CHECK-NEXT:    uunpkhi z6.s, z4.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z5.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z4.h
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z4.s
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpgt p0.b, p0/z, z2.b, #0
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z6.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z3.b
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    ret
@@ -1805,10 +1797,10 @@ define <vscale x 2 x i64> @srem_nxv2i64_y(<vscale x 2 x i64> %x, <vscale x 2 x i
 ; CHECK-LABEL: srem_nxv2i64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cmpgt p1.d, p0/z, z2.d, #0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT:    msb z1.d, p1/m, z2.d, z0.d
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    sdiv z3.d, p0/m, z3.d, z1.d
+; CHECK-NEXT:    cmpgt p0.d, p0/z, z2.d, #0
+; CHECK-NEXT:    msb z1.d, p0/m, z3.d, z0.d
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
@@ -1822,10 +1814,10 @@ define <vscale x 4 x i32> @srem_nxv4i32_y(<vscale x 4 x i32> %x, <vscale x 4 x i
 ; CHECK-LABEL: srem_nxv4i32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cmpgt p1.s, p0/z, z2.s, #0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
-; CHECK-NEXT:    msb z1.s, p1/m, z2.s, z0.s
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z1.s
+; CHECK-NEXT:    cmpgt p0.s, p0/z, z2.s, #0
+; CHECK-NEXT:    msb z1.s, p0/m, z3.s, z0.s
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
@@ -1838,18 +1830,17 @@ entry:
 define <vscale x 8 x i16> @srem_nxv8i16_y(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y, <vscale x 8 x i16> %n) {
 ; CHECK-LABEL: srem_nxv8i16_y:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    sunpkhi z3.s, z1.h
 ; CHECK-NEXT:    sunpkhi z4.s, z0.h
+; CHECK-NEXT:    sunpklo z5.s, z0.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    sunpklo z4.s, z1.h
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sdivr z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    sunpklo z5.s, z1.h
-; CHECK-NEXT:    sunpklo z6.s, z0.h
-; CHECK-NEXT:    movprfx z4, z6
-; CHECK-NEXT:    sdiv z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    cmpgt p0.h, p0/z, z2.h, #0
-; CHECK-NEXT:    uzp1 z2.h, z4.h, z3.h
-; CHECK-NEXT:    msb z1.h, p0/m, z2.h, z0.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    msb z1.h, p0/m, z3.h, z0.h
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
@@ -1867,25 +1858,24 @@ define <vscale x 16 x i8> @srem_nxv16i8_y(<vscale x 16 x i8> %x, <vscale x 16 x
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    sunpkhi z5.s, z3.h
 ; CHECK-NEXT:    sunpkhi z6.s, z4.h
-; CHECK-NEXT:    sunpklo z7.h, z1.b
-; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    sunpklo z6.h, z0.b
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpkhi z24.s, z7.h
-; CHECK-NEXT:    sunpkhi z25.s, z6.h
-; CHECK-NEXT:    sunpklo z7.s, z7.h
+; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    sunpklo z6.h, z0.b
+; CHECK-NEXT:    sunpkhi z24.s, z6.h
 ; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    movprfx z4, z25
-; CHECK-NEXT:    sdiv z4.s, p0/m, z4.s, z24.s
-; CHECK-NEXT:    sdiv z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    sunpklo z4.h, z1.b
+; CHECK-NEXT:    sunpkhi z7.s, z4.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z24.s
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z5.h
-; CHECK-NEXT:    uzp1 z4.h, z6.h, z4.h
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpgt p0.b, p0/z, z2.b, #0
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z3.b
-; CHECK-NEXT:    msb z1.b, p0/m, z2.b, z0.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z7.h
+; CHECK-NEXT:    uzp1 z3.b, z4.b, z3.b
+; CHECK-NEXT:    msb z1.b, p0/m, z3.b, z0.b
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
@@ -1899,10 +1889,10 @@ define <vscale x 2 x i64> @urem_nxv2i64_y(<vscale x 2 x i64> %x, <vscale x 2 x i
 ; CHECK-LABEL: urem_nxv2i64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    cmpgt p1.d, p0/z, z2.d, #0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
-; CHECK-NEXT:    msb z1.d, p1/m, z2.d, z0.d
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    udiv z3.d, p0/m, z3.d, z1.d
+; CHECK-NEXT:    cmpgt p0.d, p0/z, z2.d, #0
+; CHECK-NEXT:    msb z1.d, p0/m, z3.d, z0.d
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
@@ -1916,10 +1906,10 @@ define <vscale x 4 x i32> @urem_nxv4i32_y(<vscale x 4 x i32> %x, <vscale x 4 x i
 ; CHECK-LABEL: urem_nxv4i32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    cmpgt p1.s, p0/z, z2.s, #0
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
-; CHECK-NEXT:    msb z1.s, p1/m, z2.s, z0.s
+; CHECK-NEXT:    movprfx z3, z0
+; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z1.s
+; CHECK-NEXT:    cmpgt p0.s, p0/z, z2.s, #0
+; CHECK-NEXT:    msb z1.s, p0/m, z3.s, z0.s
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
@@ -1932,18 +1922,17 @@ entry:
 define <vscale x 8 x i16> @urem_nxv8i16_y(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y, <vscale x 8 x i16> %n) {
 ; CHECK-LABEL: urem_nxv8i16_y:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpkhi z3.s, z1.h
 ; CHECK-NEXT:    uunpkhi z4.s, z0.h
+; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    udivr z3.s, p1/m, z3.s, z4.s
-; CHECK-NEXT:    uunpklo z5.s, z1.h
-; CHECK-NEXT:    uunpklo z6.s, z0.h
-; CHECK-NEXT:    movprfx z4, z6
-; CHECK-NEXT:    udiv z4.s, p1/m, z4.s, z5.s
 ; CHECK-NEXT:    cmpgt p0.h, p0/z, z2.h, #0
-; CHECK-NEXT:    uzp1 z2.h, z4.h, z3.h
-; CHECK-NEXT:    msb z1.h, p0/m, z2.h, z0.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    msb z1.h, p0/m, z3.h, z0.h
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
@@ -1961,25 +1950,24 @@ define <vscale x 16 x i8> @urem_nxv16i8_y(<vscale x 16 x i8> %x, <vscale x 16 x
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpkhi z5.s, z3.h
 ; CHECK-NEXT:    uunpkhi z6.s, z4.h
-; CHECK-NEXT:    uunpklo z7.h, z1.b
-; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
-; CHECK-NEXT:    uunpklo z6.h, z0.b
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpkhi z24.s, z7.h
-; CHECK-NEXT:    uunpkhi z25.s, z6.h
-; CHECK-NEXT:    uunpklo z7.s, z7.h
+; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    uunpklo z6.h, z0.b
+; CHECK-NEXT:    uunpkhi z24.s, z6.h
 ; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    movprfx z4, z25
-; CHECK-NEXT:    udiv z4.s, p0/m, z4.s, z24.s
-; CHECK-NEXT:    udiv z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uunpklo z4.h, z1.b
+; CHECK-NEXT:    uunpkhi z7.s, z4.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z24.s
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z5.h
-; CHECK-NEXT:    uzp1 z4.h, z6.h, z4.h
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    cmpgt p0.b, p0/z, z2.b, #0
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z3.b
-; CHECK-NEXT:    msb z1.b, p0/m, z2.b, z0.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z7.h
+; CHECK-NEXT:    uzp1 z3.b, z4.b, z3.b
+; CHECK-NEXT:    msb z1.b, p0/m, z3.b, z0.b
 ; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
@@ -2625,10 +2613,10 @@ define <vscale x 4 x float> @fdiv_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fdiv_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2643,8 +2631,8 @@ define <vscale x 8 x half> @fdiv_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2657,10 +2645,10 @@ define <vscale x 2 x double> @fdiv_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fdiv_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
 ; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2865,10 +2853,10 @@ define <vscale x 4 x float> @fmai_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fmai_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2881,10 +2869,10 @@ define <vscale x 8 x half> @fmai_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fmai_nxv8f16_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2897,10 +2885,10 @@ define <vscale x 2 x double> @fmai_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fmai_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -2913,10 +2901,10 @@ define <vscale x 4 x float> @fma_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fma_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -2930,10 +2918,10 @@ define <vscale x 8 x half> @fma_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fma_nxv8f16_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -2947,10 +2935,10 @@ define <vscale x 2 x double> @fma_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fma_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer

diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
index 119ec16542ddc4..4413dcd89f4820 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll
@@ -776,10 +776,10 @@ define <vscale x 4 x float> @fdiv_nxv4f32_x(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fdiv_nxv4f32_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
 ; CHECK-NEXT:    fdivr z1.s, p0/m, z1.s, z0.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.s, p1/m, z1.s
+; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -794,8 +794,8 @@ define <vscale x 8 x half> @fdiv_nxv8f16_x(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
 ; CHECK-NEXT:    fdivr z1.h, p0/m, z1.h, z0.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.h, p1/m, z1.h
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -808,10 +808,10 @@ define <vscale x 2 x double> @fdiv_nxv2f64_x(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fdiv_nxv2f64_x:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
 ; CHECK-NEXT:    fdivr z1.d, p0/m, z1.d, z0.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    mov z0.d, p1/m, z1.d
+; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1684,10 +1684,10 @@ define <vscale x 4 x float> @fdiv_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fdiv_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    fcmle p1.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1702,8 +1702,8 @@ define <vscale x 8 x half> @fdiv_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z2.h, #0.0
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1716,10 +1716,10 @@ define <vscale x 2 x double> @fdiv_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fdiv_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
 ; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT:    not p1.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    fcmle p1.d, p0/z, z2.d, #0.0
+; CHECK-NEXT:    not p0.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1732,10 +1732,10 @@ define <vscale x 4 x float> @fmai_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fmai_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1748,10 +1748,10 @@ define <vscale x 8 x half> @fmai_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fmai_nxv8f16_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1764,10 +1764,10 @@ define <vscale x 2 x double> @fmai_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fmai_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer
@@ -1780,10 +1780,10 @@ define <vscale x 4 x float> @fma_nxv4f32_y(<vscale x 4 x float> %x, <vscale x 4
 ; CHECK-LABEL: fma_nxv4f32_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    fcmle p1.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fmla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 4 x float> %n, zeroinitializer
@@ -1797,10 +1797,10 @@ define <vscale x 8 x half> @fma_nxv8f16_y(<vscale x 8 x half> %x, <vscale x 8 x
 ; CHECK-LABEL: fma_nxv8f16_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    fcmle p1.h, p0/z, z3.h, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    fmla z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 8 x half> %n, zeroinitializer
@@ -1814,10 +1814,10 @@ define <vscale x 2 x double> @fma_nxv2f64_y(<vscale x 2 x double> %x, <vscale x
 ; CHECK-LABEL: fma_nxv2f64_y:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    fcmle p1.d, p0/z, z3.d, #0.0
-; CHECK-NEXT:    not p0.b, p0/z, p1.b
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    fmla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    not p1.b, p0/z, p1.b
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %c = fcmp ugt <vscale x 2 x double> %n, zeroinitializer

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
index dbd6207df026fa..39fe92aae06199 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -15,9 +15,8 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT:    cntw x8
 ; CHECK-NEXT:  .LBB0_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add w10, w9, w8
 ; CHECK-NEXT:    whilelt p0.s, w9, w0
-; CHECK-NEXT:    mov w9, w10
+; CHECK-NEXT:    add w9, w9, w8
 ; CHECK-NEXT:    b.mi .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: // %exit
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll
index dec544eecafe60..b5799ae7096472 100644
--- a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll
@@ -35,8 +35,8 @@ entry:
 define void @keep_scalable_store(ptr writeonly %ptr, ptr %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: keep_scalable_store:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q2, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldp q2, q1, [x1]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-select.ll b/llvm/test/CodeGen/AArch64/sve-select.ll
index 5c1cfe639bc7ef..b1270165556e67 100644
--- a/llvm/test/CodeGen/AArch64/sve-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-select.ll
@@ -602,10 +602,10 @@ define <vscale x 4 x float> @select_f32_no_invert_2_op(<vscale x 4 x float> %a,
 ; CHECK-LABEL: select_f32_no_invert_2_op:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fmul z2.s, z2.s, z3.s
+; CHECK-NEXT:    fmul z1.s, z0.s, z1.s
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    fmul z0.s, z2.s, z3.s
+; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
   %p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
   %fmul1 = fmul <vscale x 4 x float> %a, %b

diff --git a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll
index 6514ab03ae515d..132bb48d89292a 100644
--- a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll
@@ -240,11 +240,11 @@ define <vscale x 16 x i64> @sext_b_to_d(<vscale x 16 x i8> %a) {
 ; CHECK-LABEL: sext_b_to_d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sunpklo z1.h, z0.b
-; CHECK-NEXT:    sunpkhi z6.h, z0.b
+; CHECK-NEXT:    sunpkhi z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpkhi z3.s, z1.h
-; CHECK-NEXT:    sunpklo z5.s, z6.h
-; CHECK-NEXT:    sunpkhi z7.s, z6.h
+; CHECK-NEXT:    sunpklo z5.s, z0.h
+; CHECK-NEXT:    sunpkhi z7.s, z0.h
 ; CHECK-NEXT:    sunpklo z0.d, z2.s
 ; CHECK-NEXT:    sunpkhi z1.d, z2.s
 ; CHECK-NEXT:    sunpklo z2.d, z3.s
@@ -309,11 +309,11 @@ define <vscale x 16 x i64> @zext_b_to_d(<vscale x 16 x i8> %a) {
 ; CHECK-LABEL: zext_b_to_d:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    uunpklo z1.h, z0.b
-; CHECK-NEXT:    uunpkhi z6.h, z0.b
+; CHECK-NEXT:    uunpkhi z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpkhi z3.s, z1.h
-; CHECK-NEXT:    uunpklo z5.s, z6.h
-; CHECK-NEXT:    uunpkhi z7.s, z6.h
+; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    uunpkhi z7.s, z0.h
 ; CHECK-NEXT:    uunpklo z0.d, z2.s
 ; CHECK-NEXT:    uunpkhi z1.d, z2.s
 ; CHECK-NEXT:    uunpklo z2.d, z3.s

diff --git a/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll b/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
index eebd4a228a769e..871c74a1de14c2 100644
--- a/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll
@@ -92,18 +92,19 @@ define <vscale x 32 x i8> @smulo_nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    smulh z4.b, p0/m, z4.b, z3.b
-; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT:    movprfx z3, z0
-; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z2.b
-; CHECK-NEXT:    asr z5.b, z1.b, #7
+; CHECK-NEXT:    mul z4.b, p0/m, z4.b, z3.b
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    mul z5.b, p0/m, z5.b, z2.b
+; CHECK-NEXT:    smulh z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z2.b
-; CHECK-NEXT:    asr z2.b, z3.b, #7
-; CHECK-NEXT:    cmpne p1.b, p0/z, z4.b, z5.b
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, z2.b
-; CHECK-NEXT:    mov z1.b, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.b, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z3.d
+; CHECK-NEXT:    asr z2.b, z4.b, #7
+; CHECK-NEXT:    asr z3.b, z5.b, #7
+; CHECK-NEXT:    cmpne p1.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, z3.b
+; CHECK-NEXT:    mov z5.b, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.b, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y)
   %b = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 0
@@ -119,31 +120,33 @@ define <vscale x 64 x i8> @smulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    smulh z24.b, p0/m, z24.b, z7.b
-; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z7.b
-; CHECK-NEXT:    movprfx z7, z2
-; CHECK-NEXT:    mul z7.b, p0/m, z7.b, z6.b
-; CHECK-NEXT:    smulh z2.b, p0/m, z2.b, z6.b
-; CHECK-NEXT:    asr z6.b, z7.b, #7
-; CHECK-NEXT:    cmpne p2.b, p0/z, z2.b, z6.b
-; CHECK-NEXT:    movprfx z6, z1
-; CHECK-NEXT:    smulh z6.b, p0/m, z6.b, z5.b
-; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z5.b
-; CHECK-NEXT:    asr z25.b, z3.b, #7
-; CHECK-NEXT:    asr z5.b, z1.b, #7
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z4.b
+; CHECK-NEXT:    mul z24.b, p0/m, z24.b, z7.b
+; CHECK-NEXT:    movprfx z25, z0
+; CHECK-NEXT:    mul z25.b, p0/m, z25.b, z4.b
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    mul z26.b, p0/m, z26.b, z6.b
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    mul z27.b, p0/m, z27.b, z5.b
+; CHECK-NEXT:    smulh z3.b, p0/m, z3.b, z7.b
 ; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z4.b
-; CHECK-NEXT:    asr z4.b, z2.b, #7
-; CHECK-NEXT:    cmpne p1.b, p0/z, z24.b, z25.b
-; CHECK-NEXT:    cmpne p3.b, p0/z, z6.b, z5.b
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, z4.b
-; CHECK-NEXT:    mov z7.b, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.b, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.b, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.b, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    mov z2.d, z7.d
+; CHECK-NEXT:    asr z4.b, z25.b, #7
+; CHECK-NEXT:    smulh z2.b, p0/m, z2.b, z6.b
+; CHECK-NEXT:    smulh z1.b, p0/m, z1.b, z5.b
+; CHECK-NEXT:    asr z5.b, z24.b, #7
+; CHECK-NEXT:    asr z6.b, z26.b, #7
+; CHECK-NEXT:    asr z7.b, z27.b, #7
+; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, z4.b
+; CHECK-NEXT:    cmpne p2.b, p0/z, z3.b, z5.b
+; CHECK-NEXT:    cmpne p3.b, p0/z, z2.b, z6.b
+; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, z7.b
+; CHECK-NEXT:    mov z25.b, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z24.b, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z27.b, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z26.b, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z3.d, z24.d
+; CHECK-NEXT:    mov z1.d, z27.d
+; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.smul.with.overflow.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y)
   %b = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 0
@@ -222,18 +225,19 @@ define <vscale x 16 x i16> @smulo_nxv16i16(<vscale x 16 x i16> %x, <vscale x 16
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    smulh z4.h, p0/m, z4.h, z3.h
-; CHECK-NEXT:    mul z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT:    movprfx z3, z0
-; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z2.h
-; CHECK-NEXT:    asr z5.h, z1.h, #15
+; CHECK-NEXT:    mul z4.h, p0/m, z4.h, z3.h
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    mul z5.h, p0/m, z5.h, z2.h
+; CHECK-NEXT:    smulh z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z2.h
-; CHECK-NEXT:    asr z2.h, z3.h, #15
-; CHECK-NEXT:    cmpne p1.h, p0/z, z4.h, z5.h
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, z2.h
-; CHECK-NEXT:    mov z1.h, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.h, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z3.d
+; CHECK-NEXT:    asr z2.h, z4.h, #15
+; CHECK-NEXT:    asr z3.h, z5.h, #15
+; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, z3.h
+; CHECK-NEXT:    mov z5.h, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.h, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y)
   %b = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 0
@@ -249,31 +253,33 @@ define <vscale x 32 x i16> @smulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    smulh z24.h, p0/m, z24.h, z7.h
-; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z7.h
-; CHECK-NEXT:    movprfx z7, z2
-; CHECK-NEXT:    mul z7.h, p0/m, z7.h, z6.h
-; CHECK-NEXT:    smulh z2.h, p0/m, z2.h, z6.h
-; CHECK-NEXT:    asr z6.h, z7.h, #15
-; CHECK-NEXT:    cmpne p2.h, p0/z, z2.h, z6.h
-; CHECK-NEXT:    movprfx z6, z1
-; CHECK-NEXT:    smulh z6.h, p0/m, z6.h, z5.h
-; CHECK-NEXT:    mul z1.h, p0/m, z1.h, z5.h
-; CHECK-NEXT:    asr z25.h, z3.h, #15
-; CHECK-NEXT:    asr z5.h, z1.h, #15
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z4.h
+; CHECK-NEXT:    mul z24.h, p0/m, z24.h, z7.h
+; CHECK-NEXT:    movprfx z25, z0
+; CHECK-NEXT:    mul z25.h, p0/m, z25.h, z4.h
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    mul z26.h, p0/m, z26.h, z6.h
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    mul z27.h, p0/m, z27.h, z5.h
+; CHECK-NEXT:    smulh z3.h, p0/m, z3.h, z7.h
 ; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z4.h
-; CHECK-NEXT:    asr z4.h, z2.h, #15
-; CHECK-NEXT:    cmpne p1.h, p0/z, z24.h, z25.h
-; CHECK-NEXT:    cmpne p3.h, p0/z, z6.h, z5.h
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, z4.h
-; CHECK-NEXT:    mov z7.h, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.h, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.h, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.h, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    mov z2.d, z7.d
+; CHECK-NEXT:    asr z4.h, z25.h, #15
+; CHECK-NEXT:    smulh z2.h, p0/m, z2.h, z6.h
+; CHECK-NEXT:    smulh z1.h, p0/m, z1.h, z5.h
+; CHECK-NEXT:    asr z5.h, z24.h, #15
+; CHECK-NEXT:    asr z6.h, z26.h, #15
+; CHECK-NEXT:    asr z7.h, z27.h, #15
+; CHECK-NEXT:    cmpne p1.h, p0/z, z0.h, z4.h
+; CHECK-NEXT:    cmpne p2.h, p0/z, z3.h, z5.h
+; CHECK-NEXT:    cmpne p3.h, p0/z, z2.h, z6.h
+; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, z7.h
+; CHECK-NEXT:    mov z25.h, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z24.h, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z27.h, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z26.h, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z3.d, z24.d
+; CHECK-NEXT:    mov z1.d, z27.d
+; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.smul.with.overflow.nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y)
   %b = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 0
@@ -331,18 +337,19 @@ define <vscale x 8 x i32> @smulo_nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i3
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    smulh z4.s, p0/m, z4.s, z3.s
-; CHECK-NEXT:    mul z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT:    movprfx z3, z0
-; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z2.s
-; CHECK-NEXT:    asr z5.s, z1.s, #31
+; CHECK-NEXT:    mul z4.s, p0/m, z4.s, z3.s
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    mul z5.s, p0/m, z5.s, z2.s
+; CHECK-NEXT:    smulh z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT:    asr z2.s, z3.s, #31
-; CHECK-NEXT:    cmpne p1.s, p0/z, z4.s, z5.s
-; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, z2.s
-; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z3.d
+; CHECK-NEXT:    asr z2.s, z4.s, #31
+; CHECK-NEXT:    asr z3.s, z5.s, #31
+; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, z3.s
+; CHECK-NEXT:    mov z5.s, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y)
   %b = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 0
@@ -358,31 +365,33 @@ define <vscale x 16 x i32> @smulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    smulh z24.s, p0/m, z24.s, z7.s
-; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z7.s
-; CHECK-NEXT:    movprfx z7, z2
-; CHECK-NEXT:    mul z7.s, p0/m, z7.s, z6.s
-; CHECK-NEXT:    smulh z2.s, p0/m, z2.s, z6.s
-; CHECK-NEXT:    asr z6.s, z7.s, #31
-; CHECK-NEXT:    cmpne p2.s, p0/z, z2.s, z6.s
-; CHECK-NEXT:    movprfx z6, z1
-; CHECK-NEXT:    smulh z6.s, p0/m, z6.s, z5.s
-; CHECK-NEXT:    mul z1.s, p0/m, z1.s, z5.s
-; CHECK-NEXT:    asr z25.s, z3.s, #31
-; CHECK-NEXT:    asr z5.s, z1.s, #31
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z4.s
+; CHECK-NEXT:    mul z24.s, p0/m, z24.s, z7.s
+; CHECK-NEXT:    movprfx z25, z0
+; CHECK-NEXT:    mul z25.s, p0/m, z25.s, z4.s
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    mul z26.s, p0/m, z26.s, z6.s
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    mul z27.s, p0/m, z27.s, z5.s
+; CHECK-NEXT:    smulh z3.s, p0/m, z3.s, z7.s
 ; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z4.s
-; CHECK-NEXT:    asr z4.s, z2.s, #31
-; CHECK-NEXT:    cmpne p1.s, p0/z, z24.s, z25.s
-; CHECK-NEXT:    cmpne p3.s, p0/z, z6.s, z5.s
-; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, z4.s
-; CHECK-NEXT:    mov z7.s, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.s, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    mov z2.d, z7.d
+; CHECK-NEXT:    asr z4.s, z25.s, #31
+; CHECK-NEXT:    smulh z2.s, p0/m, z2.s, z6.s
+; CHECK-NEXT:    smulh z1.s, p0/m, z1.s, z5.s
+; CHECK-NEXT:    asr z5.s, z24.s, #31
+; CHECK-NEXT:    asr z6.s, z26.s, #31
+; CHECK-NEXT:    asr z7.s, z27.s, #31
+; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, z4.s
+; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, z5.s
+; CHECK-NEXT:    cmpne p3.s, p0/z, z2.s, z6.s
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, z7.s
+; CHECK-NEXT:    mov z25.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z24.s, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z27.s, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z26.s, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z3.d, z24.d
+; CHECK-NEXT:    mov z1.d, z27.d
+; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.smul.with.overflow.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
   %b = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 0
@@ -419,18 +428,19 @@ define <vscale x 4 x i64> @smulo_nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i6
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    smulh z4.d, p0/m, z4.d, z3.d
-; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT:    movprfx z3, z0
-; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z2.d
-; CHECK-NEXT:    asr z5.d, z1.d, #63
+; CHECK-NEXT:    mul z4.d, p0/m, z4.d, z3.d
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    mul z5.d, p0/m, z5.d, z2.d
+; CHECK-NEXT:    smulh z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT:    asr z2.d, z3.d, #63
-; CHECK-NEXT:    cmpne p1.d, p0/z, z4.d, z5.d
-; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, z2.d
-; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z3.d
+; CHECK-NEXT:    asr z2.d, z4.d, #63
+; CHECK-NEXT:    asr z3.d, z5.d, #63
+; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, z3.d
+; CHECK-NEXT:    mov z5.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z4.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    mov z1.d, z4.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.smul.with.overflow.nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y)
   %b = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 0
@@ -446,31 +456,33 @@ define <vscale x 8 x i64> @smulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i6
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    smulh z24.d, p0/m, z24.d, z7.d
-; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z7.d
-; CHECK-NEXT:    movprfx z7, z2
-; CHECK-NEXT:    mul z7.d, p0/m, z7.d, z6.d
-; CHECK-NEXT:    smulh z2.d, p0/m, z2.d, z6.d
-; CHECK-NEXT:    asr z6.d, z7.d, #63
-; CHECK-NEXT:    cmpne p2.d, p0/z, z2.d, z6.d
-; CHECK-NEXT:    movprfx z6, z1
-; CHECK-NEXT:    smulh z6.d, p0/m, z6.d, z5.d
-; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z5.d
-; CHECK-NEXT:    asr z25.d, z3.d, #63
-; CHECK-NEXT:    asr z5.d, z1.d, #63
-; CHECK-NEXT:    movprfx z2, z0
-; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z4.d
+; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z7.d
+; CHECK-NEXT:    movprfx z25, z0
+; CHECK-NEXT:    mul z25.d, p0/m, z25.d, z4.d
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    mul z26.d, p0/m, z26.d, z6.d
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    mul z27.d, p0/m, z27.d, z5.d
+; CHECK-NEXT:    smulh z3.d, p0/m, z3.d, z7.d
 ; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z4.d
-; CHECK-NEXT:    asr z4.d, z2.d, #63
-; CHECK-NEXT:    cmpne p1.d, p0/z, z24.d, z25.d
-; CHECK-NEXT:    cmpne p3.d, p0/z, z6.d, z5.d
-; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, z4.d
-; CHECK-NEXT:    mov z7.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z3.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    mov z2.d, z7.d
+; CHECK-NEXT:    asr z4.d, z25.d, #63
+; CHECK-NEXT:    smulh z2.d, p0/m, z2.d, z6.d
+; CHECK-NEXT:    smulh z1.d, p0/m, z1.d, z5.d
+; CHECK-NEXT:    asr z5.d, z24.d, #63
+; CHECK-NEXT:    asr z6.d, z26.d, #63
+; CHECK-NEXT:    asr z7.d, z27.d, #63
+; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, z4.d
+; CHECK-NEXT:    cmpne p2.d, p0/z, z3.d, z5.d
+; CHECK-NEXT:    cmpne p3.d, p0/z, z2.d, z6.d
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, z7.d
+; CHECK-NEXT:    mov z25.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z24.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z27.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z26.d, p3/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z3.d, z24.d
+; CHECK-NEXT:    mov z1.d, z27.d
+; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.smul.with.overflow.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
   %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 0

diff --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
index d54e6259a7429b..d79990e9e9616e 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
@@ -22,15 +22,15 @@ define i8 @split_extract_32i8_idx(<vscale x 32 x i8> %a, i32 %idx) {
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w9, w0
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov w9, w0
 ; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    csel x8, x9, x8, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT:    ldrb w0, [x9, x8]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -46,15 +46,15 @@ define i16 @split_extract_16i16_idx(<vscale x 16 x i16> %a, i32 %idx) {
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w9, w0
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov w9, w0
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    cmp x9, x8
 ; CHECK-NEXT:    csel x8, x9, x8, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    ldrh w0, [x9, x8, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -70,14 +70,14 @@ define i32 @split_extract_8i32_idx(<vscale x 8 x i32> %a, i32 %idx) {
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cnth x8
 ; CHECK-NEXT:    mov w9, w0
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x9, x8, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    ldr w0, [x9, x8, lsl #2]
 ; CHECK-NEXT:    addvl sp, sp, #2
@@ -94,14 +94,14 @@ define i64 @split_extract_8i64_idx(<vscale x 8 x i64> %a, i32 %idx) {
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cnth x8
 ; CHECK-NEXT:    mov w9, w0
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    csel x8, x9, x8, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    st1d { z2.d }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
@@ -140,15 +140,15 @@ define i16 @split_extract_16i16(<vscale x 16 x i16> %a) {
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w9, #128
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov w9, #128 // =0x80
 ; CHECK-NEXT:    addvl x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    ldrh w0, [x9, x8, lsl #1]
 ; CHECK-NEXT:    addvl sp, sp, #2
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -164,18 +164,18 @@ define i32 @split_extract_16i32(<vscale x 16 x i32> %a) {
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w9, #34464
-; CHECK-NEXT:    movk w9, #1, lsl #16
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov w9, #34464 // =0x86a0
+; CHECK-NEXT:    movk w9, #1, lsl #16
 ; CHECK-NEXT:    addvl x8, x8, #1
-; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    ldr w0, [x9, x8, lsl #2]
 ; CHECK-NEXT:    addvl sp, sp, #4
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -191,14 +191,14 @@ define i64 @split_extract_4i64(<vscale x 4 x i64> %a) {
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    mov w9, #10
+; CHECK-NEXT:    mov w9, #10 // =0xa
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmp x8, #10
-; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT:    ldr x0, [x9, x8, lsl #3]
 ; CHECK-NEXT:    addvl sp, sp, #2

diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
index 99e9e61fca2953..44796606e7a1a4 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
@@ -42,7 +42,6 @@ define <vscale x 8 x double> @fcvtd_nxv8f16(<vscale x 8 x half> %a) {
 ; CHECK-NEXT:    uunpklo z2.d, z1.s
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
-; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    uunpkhi z4.d, z0.s
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z2.h
@@ -50,6 +49,7 @@ define <vscale x 8 x double> @fcvtd_nxv8f16(<vscale x 8 x half> %a) {
 ; CHECK-NEXT:    fcvt z2.d, p0/m, z3.h
 ; CHECK-NEXT:    movprfx z3, z4
 ; CHECK-NEXT:    fcvt z3.d, p0/m, z4.h
+; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
 ; CHECK-NEXT:    ret
   %res = fpext <vscale x 8 x half> %a to <vscale x 8 x double>
   ret <vscale x 8 x double> %res
@@ -77,13 +77,13 @@ define <vscale x 8 x double> @fcvtd_nxv8f32(<vscale x 8 x float> %a) {
 ; CHECK-NEXT:    uunpkhi z3.d, z0.s
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uunpkhi z5.d, z1.s
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z2.s
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fcvt z1.d, p0/m, z3.s
 ; CHECK-NEXT:    movprfx z2, z4
 ; CHECK-NEXT:    fcvt z2.d, p0/m, z4.s
+; CHECK-NEXT:    uunpkhi z5.d, z1.s
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    fcvt z1.d, p0/m, z3.s
 ; CHECK-NEXT:    movprfx z3, z5
 ; CHECK-NEXT:    fcvt z3.d, p0/m, z5.s
 ; CHECK-NEXT:    ret
@@ -149,9 +149,9 @@ define <vscale x 8 x float> @fcvts_nxv8f64(<vscale x 8 x double> %a) {
 ; CHECK-LABEL: fcvts_nxv8f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    fcvt z3.s, p0/m, z3.d
 ; CHECK-NEXT:    fcvt z1.s, p0/m, z1.d
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
+; CHECK-NEXT:    fcvt z3.s, p0/m, z3.d
 ; CHECK-NEXT:    fcvt z2.s, p0/m, z2.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.s, z2.s, z3.s
@@ -214,13 +214,13 @@ define <vscale x 16 x i32> @fcvtzs_s_nxv16f16(<vscale x 16 x half> %a) {
 ; CHECK-NEXT:    uunpkhi z3.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEXT:    uunpkhi z5.s, z1.h
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z2.h
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fcvtzs z1.s, p0/m, z3.h
 ; CHECK-NEXT:    movprfx z2, z4
 ; CHECK-NEXT:    fcvtzs z2.s, p0/m, z4.h
+; CHECK-NEXT:    uunpkhi z5.s, z1.h
+; CHECK-NEXT:    movprfx z1, z3
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z3.h
 ; CHECK-NEXT:    movprfx z3, z5
 ; CHECK-NEXT:    fcvtzs z3.s, p0/m, z5.h
 ; CHECK-NEXT:    ret
@@ -300,7 +300,6 @@ define <vscale x 16 x float> @scvtf_s_nxv16i8(<vscale x 16 x i8> %a) {
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
-; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    sunpkhi z4.s, z0.h
 ; CHECK-NEXT:    movprfx z0, z2
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z2.s
@@ -308,6 +307,7 @@ define <vscale x 16 x float> @scvtf_s_nxv16i8(<vscale x 16 x i8> %a) {
 ; CHECK-NEXT:    scvtf z2.s, p0/m, z3.s
 ; CHECK-NEXT:    movprfx z3, z4
 ; CHECK-NEXT:    scvtf z3.s, p0/m, z4.s
+; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
   %res = sitofp <vscale x 16 x i8> %a to <vscale x 16 x float>
   ret <vscale x 16 x float> %res

diff --git a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll
index 53200c9a56fd55..7f642882eddbee 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll
@@ -6,8 +6,8 @@
 define double @fadda_nxv8f64(double %init, <vscale x 8 x double> %a) {
 ; CHECK-LABEL: fadda_nxv8f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fadda d0, p0, d0, z1.d
 ; CHECK-NEXT:    fadda d0, p0, d0, z2.d
 ; CHECK-NEXT:    fadda d0, p0, d0, z3.d

diff --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
index 0465da7e7093d0..7984057241c847 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll
@@ -6,9 +6,9 @@
 define <vscale x 8 x i8> @promote_insert_8i8(<vscale x 8 x i8> %a, i8 %elt, i64 %idx) {
 ; CHECK-LABEL: promote_insert_8i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    mov z2.h, w1
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT:    mov z0.h, p0/m, w0
 ; CHECK-NEXT:    ret
@@ -23,14 +23,14 @@ define <vscale x 32 x i8> @split_insert_32i8_idx(<vscale x 32 x i8> %a, i8 %elt,
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
-; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    addvl x8, x8, #2
 ; CHECK-NEXT:    cmp x1, x8
 ; CHECK-NEXT:    csel x8, x1, x8, lo
+; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
 ; CHECK-NEXT:    strb w0, [x9, x8]
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
 ; CHECK-NEXT:    ld1b { z1.b }, p0/z, [sp, #1, mul vl]
@@ -48,13 +48,13 @@ define <vscale x 8 x float> @split_insert_8f32_idx(<vscale x 8 x float> %a, floa
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cnth x8
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cmp x0, x8
-; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    csel x8, x0, x8, lo
+; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT:    str s2, [x9, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
@@ -73,13 +73,13 @@ define <vscale x 8 x i64> @split_insert_8i64_idx(<vscale x 8 x i64> %a, i64 %elt
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cnth x8
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    sub x8, x8, #1
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    cmp x1, x8
-; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    csel x8, x1, x8, lo
+; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #3, mul vl]
 ; CHECK-NEXT:    st1d { z2.d }, p0, [sp, #2, mul vl]
 ; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
@@ -100,11 +100,11 @@ define <vscale x 8 x i64> @split_insert_8i64_idx(<vscale x 8 x i64> %a, i64 %elt
 define <vscale x 4 x i16> @promote_insert_4i16(<vscale x 4 x i16> %a, i16 %elt) {
 ; CHECK-LABEL: promote_insert_4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #5
-; CHECK-NEXT:    index z2.s, #0, #1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z1.s
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    index z1.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT:    mov z0.s, p0/m, w0
 ; CHECK-NEXT:    ret
   %ins = insertelement <vscale x 4 x i16> %a, i16 %elt, i64 5
@@ -117,11 +117,11 @@ define <vscale x 4 x i16> @promote_insert_4i16(<vscale x 4 x i16> %a, i16 %elt)
 define <vscale x 32 x i8> @split_insert_32i8(<vscale x 32 x i8> %a, i8 %elt) {
 ; CHECK-LABEL: split_insert_32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3
-; CHECK-NEXT:    index z3.b, #0, #1
 ; CHECK-NEXT:    ptrue p0.b
-; CHECK-NEXT:    mov z2.b, w8
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z3.b, z2.b
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    mov z3.b, w8
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
 ; CHECK-NEXT:    mov z0.b, p0/m, w0
 ; CHECK-NEXT:    ret
   %ins = insertelement <vscale x 32 x i8> %a, i8 %elt, i64 3
@@ -135,17 +135,17 @@ define <vscale x 32 x i16> @split_insert_32i16(<vscale x 32 x i16> %a, i16 %elt)
 ; CHECK-NEXT:    addvl sp, sp, #-4
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    mov x8, #-1
-; CHECK-NEXT:    mov w9, #128
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    st1h { z3.h }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    st1h { z2.h }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    mov x8, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov w9, #128 // =0x80
 ; CHECK-NEXT:    addvl x8, x8, #2
-; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1h { z3.h }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1h { z2.h }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT:    strh w0, [x9, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [sp, #1, mul vl]
@@ -165,12 +165,12 @@ define <vscale x 8 x i32> @split_insert_8i32(<vscale x 8 x i32> %a, i32 %elt) {
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    cnth x8
-; CHECK-NEXT:    mov w9, #16960
+; CHECK-NEXT:    mov w9, #16960 // =0x4240
 ; CHECK-NEXT:    movk w9, #15, lsl #16
 ; CHECK-NEXT:    sub x8, x8, #1
 ; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    csel x8, x8, x9, lo
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]

diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
index ef0dbd81960986..bf55e0327441b4 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
@@ -18,20 +18,12 @@ define i1 @andv_nxv32i1(<vscale x 32 x i1> %a) {
 define i1 @andv_nxv64i1(<vscale x 64 x i1> %a) {
 ; CHECK-LABEL: andv_nxv64i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    and p1.b, p1/z, p1.b, p3.b
 ; CHECK-NEXT:    and p0.b, p0/z, p0.b, p2.b
-; CHECK-NEXT:    ptrue p4.b
 ; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT:    nots p0.b, p4/z, p0.b
-; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    nots p0.b, p1/z, p0.b
 ; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call i1 @llvm.vector.reduce.and.nxv64i1(<vscale x 64 x i1> %a)
   ret i1 %res

diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll
index 50f6bff8f4670e..b8a5e1141cdf13 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll
@@ -122,11 +122,11 @@ define <vscale x 8 x i64> @masked_load_split_8i64(<vscale x 8 x i64> *%a, <vscal
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
 ; CHECK-NEXT:    punpklo p2.h, p1.b
 ; CHECK-NEXT:    punpkhi p1.h, p1.b
-; CHECK-NEXT:    ld1d { z0.d }, p2/z, [x0]
-; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    punpklo p3.h, p0.b
 ; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    ld1d { z0.d }, p2/z, [x0]
 ; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x0, #1, mul vl]
-; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1d { z2.d }, p3/z, [x0, #2, mul vl]
 ; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0, #3, mul vl]
 ; CHECK-NEXT:    ret
   %load = call <vscale x 8 x i64> @llvm.masked.load.nxv8i64(<vscale x 8 x i64> *%a, i32 1, <vscale x 8 x i1> %pg, <vscale x 8 x i64> undef)

diff --git a/llvm/test/CodeGen/AArch64/sve-split-store.ll b/llvm/test/CodeGen/AArch64/sve-split-store.ll
index 387dde66242b1d..cd46430c1efd42 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-store.ll
@@ -80,11 +80,11 @@ define void @masked_store_split_32i16(<vscale x 32 x i16> %data, <vscale x 32 x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    punpkhi p2.h, p1.b
 ; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpkhi p3.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    st1h { z3.h }, p2, [x0, #3, mul vl]
 ; CHECK-NEXT:    st1h { z2.h }, p1, [x0, #2, mul vl]
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    st1h { z1.h }, p1, [x0, #1, mul vl]
+; CHECK-NEXT:    st1h { z1.h }, p3, [x0, #1, mul vl]
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.nxv32i16(<vscale x 32 x i16> %data, <vscale x 32 x i16> *%a, i32 1, <vscale x 32 x i1> %pg)
@@ -110,11 +110,11 @@ define void @masked_store_split_8i64(<vscale x 8 x i64> %data, <vscale x 8 x i64
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    punpkhi p2.h, p1.b
 ; CHECK-NEXT:    punpklo p1.h, p1.b
+; CHECK-NEXT:    punpkhi p3.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    st1d { z3.d }, p2, [x0, #3, mul vl]
 ; CHECK-NEXT:    st1d { z2.d }, p1, [x0, #2, mul vl]
-; CHECK-NEXT:    punpkhi p1.h, p0.b
-; CHECK-NEXT:    punpklo p0.h, p0.b
-; CHECK-NEXT:    st1d { z1.d }, p1, [x0, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p3, [x0, #1, mul vl]
 ; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.nxv8i64(<vscale x 8 x i64> %data, <vscale x 8 x i64> *%a, i32 1, <vscale x 8 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll
index 102abfe07bc9ee..d001ae9f771218 100644
--- a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll
+++ b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll
@@ -6,10 +6,10 @@ target triple = "aarch64-unknown-linux-gnu"
 define <vscale x 4 x i32> @srem_combine_loop(<vscale x 4 x i32> %a) #0 {
 ; CHECK-LABEL: srem_combine_loop:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    asrd z1.s, p0/m, z1.s, #1
+; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.s, #2 // =0x2
+; CHECK-NEXT:    asrd z1.s, p0/m, z1.s, #1
 ; CHECK-NEXT:    mls z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    ret
   %rem = srem <vscale x 4 x i32> %a, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)

diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
index 4702e366002924..9f6fdf6b397185 100644
--- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
@@ -39,8 +39,8 @@ define void @st1b_upper_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a)
 define void @st1b_out_of_upper_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: st1b_out_of_upper_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rdvl x8, #8
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    rdvl x8, #8
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 8
@@ -51,8 +51,8 @@ define void @st1b_out_of_upper_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8
 define void @st1b_out_of_lower_bound(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %a) {
 ; CHECK-LABEL: st1b_out_of_lower_bound:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    rdvl x8, #-9
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    rdvl x8, #-9
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
 ; CHECK-NEXT:    ret
   %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %a, i64 -9
@@ -105,8 +105,8 @@ define void @st1d_inbound(<vscale x 2 x i64> %data, <vscale x 2 x i64>* %a) {
 define void @store_nxv2f32(<vscale x 2 x float>* %out) {
 ; CHECK-LABEL: store_nxv2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov z0.s, #1.00000000
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmov z0.s, #1.00000000
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %ins = insertelement <vscale x 2 x float> undef, float 1.0, i32 0
@@ -118,8 +118,8 @@ define void @store_nxv2f32(<vscale x 2 x float>* %out) {
 define void @store_nxv4f16(<vscale x 4 x half>* %out) {
 ; CHECK-LABEL: store_nxv4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov z0.h, #1.00000000
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmov z0.h, #1.00000000
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %ins = insertelement <vscale x 4 x half> undef, half 1.0, i32 0
@@ -133,9 +133,9 @@ define void @store_nxv4f16(<vscale x 4 x half>* %out) {
 define void @store_nxv6f32(<vscale x 6 x float>* %out) {
 ; CHECK-LABEL: store_nxv6f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov z0.s, #1.00000000
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    fmov z0.s, #1.00000000
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #2, mul vl]
 ; CHECK-NEXT:    st1w { z0.s }, p1, [x0]
 ; CHECK-NEXT:    ret
@@ -148,9 +148,9 @@ define void @store_nxv6f32(<vscale x 6 x float>* %out) {
 define void @store_nxv12f16(<vscale x 12 x half>* %out) {
 ; CHECK-LABEL: store_nxv12f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov z0.h, #1.00000000
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    fmov z0.h, #1.00000000
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #2, mul vl]
 ; CHECK-NEXT:    st1h { z0.h }, p1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-stepvector.ll b/llvm/test/CodeGen/AArch64/sve-stepvector.ll
index 4b3cb9c038613f..6f5a31248de7ed 100644
--- a/llvm/test/CodeGen/AArch64/sve-stepvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stepvector.ll
@@ -49,8 +49,8 @@ define <vscale x 6 x i64> @stepvector_nxv6i64() {
 ; CHECK-LABEL: stepvector_nxv6i64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    index z0.d, #0, #1
-; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    incd z1.d
 ; CHECK-NEXT:    incd z2.d, all, mul #2
 ; CHECK-NEXT:    ret
@@ -209,10 +209,10 @@ define <vscale x 4 x i32> @multiple_use_stepvector_nxv4i32_1(i32 %data) {
 ; CHECK-LABEL: multiple_use_stepvector_nxv4i32_1:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z0.s, w0
-; CHECK-NEXT:    index z1.s, w0, #1
-; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    sub z0.s, z0.s, z1.s
+; CHECK-NEXT:    index z0.s, w0, #1
+; CHECK-NEXT:    mov z1.s, w0
+; CHECK-NEXT:    mul z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT:    sub z0.s, z1.s, z0.s
 ; CHECK-NEXT:    ret
 entry:
   %0 = insertelement <vscale x 4 x i32> poison, i32 %data, i32 0
@@ -242,8 +242,8 @@ define <vscale x 2 x i64> @multiple_use_stepvector_nxv2i64_1(i64 %data) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    mov z1.d, x0
-; CHECK-NEXT:    add z1.d, z0.d, z1.d
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    add z1.d, z0.d, z1.d
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
@@ -271,7 +271,7 @@ entry:
 define <vscale x 2 x i64> @mul_stepvector_nxv2i64() {
 ; CHECK-LABEL: mul_stepvector_nxv2i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #2222
+; CHECK-NEXT:    mov w8, #2222 // =0x8ae
 ; CHECK-NEXT:    index z0.d, #0, x8
 ; CHECK-NEXT:    ret
 entry:
@@ -285,7 +285,7 @@ entry:
 define <vscale x 2 x i64> @mul_stepvector_bigconst_nxv2i64() {
 ; CHECK-LABEL: mul_stepvector_bigconst_nxv2i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #146028888064
+; CHECK-NEXT:    mov x8, #146028888064 // =0x2200000000
 ; CHECK-NEXT:    index z0.d, #0, x8
 ; CHECK-NEXT:    ret
 entry:
@@ -299,7 +299,7 @@ entry:
 define <vscale x 2 x i64> @mul_add_stepvector_nxv2i64(i64 %x) {
 ; CHECK-LABEL: mul_add_stepvector_nxv2i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #2222
+; CHECK-NEXT:    mov w8, #2222 // =0x8ae
 ; CHECK-NEXT:    index z0.d, x0, x8
 ; CHECK-NEXT:    ret
 entry:
@@ -332,7 +332,7 @@ entry:
 define <vscale x 2 x i64> @mul_add_stepvector_bigconst_nxv2i64(i64 %x) {
 ; CHECK-LABEL: mul_add_stepvector_bigconst_nxv2i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov x8, #146028888064
+; CHECK-NEXT:    mov x8, #146028888064 // =0x2200000000
 ; CHECK-NEXT:    index z0.d, x0, x8
 ; CHECK-NEXT:    ret
 entry:
@@ -425,12 +425,12 @@ define <vscale x 16 x i32> @split_sub_stepvector_nxv16i32() {
 ; CHECK-LABEL: split_sub_stepvector_nxv16i32:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cntw x8
-; CHECK-NEXT:    cnth x9
-; CHECK-NEXT:    neg x8, x8
-; CHECK-NEXT:    neg x9, x9
 ; CHECK-NEXT:    index z0.s, #0, #-1
+; CHECK-NEXT:    neg x8, x8
 ; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    mov z3.s, w9
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    neg x8, x8
+; CHECK-NEXT:    mov z3.s, w8
 ; CHECK-NEXT:    add z1.s, z0.s, z1.s
 ; CHECK-NEXT:    add z2.s, z0.s, z3.s
 ; CHECK-NEXT:    add z3.s, z1.s, z3.s

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index a5da99a9e6e8db..b80cb88c729e27 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -122,8 +122,8 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind {
 define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind {
 ; CHECK-LABEL: vls_sve_and_2xi32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    index z1.s, #0, #-1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -164,8 +164,8 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind {
 define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind {
 ; CHECK-LABEL: vls_sve_and_2xi64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    index z1.d, #0, #-1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    and z0.d, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -176,9 +176,9 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind {
 define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind {
 ; CHECK-LABEL: vls_sve_and_4xi64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    index z2.d, #0, #-1
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    index z2.d, #0, #-1
 ; CHECK-NEXT:    and z0.d, z0.d, z2.d
 ; CHECK-NEXT:    and z1.d, z1.d, z2.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index 8f36a9673b821d..a00569d8849596 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i8> @ctlz_v4i8(<4 x i8> %op) {
 ; CHECK-LABEL: ctlz_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    and z0.h, z0.h, #0xff
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    sub z0.h, z0.h, #8 // =0x8
@@ -24,8 +24,8 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) {
 define <8 x i8> @ctlz_v8i8(<8 x i8> %op) {
 ; CHECK-LABEL: ctlz_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -36,8 +36,8 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) {
 define <16 x i8> @ctlz_v16i8(<16 x i8> %op) {
 ; CHECK-LABEL: ctlz_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -48,8 +48,8 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) {
 define void @ctlz_v32i8(ptr %a) {
 ; CHECK-LABEL: ctlz_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    clz z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -63,8 +63,8 @@ define void @ctlz_v32i8(ptr %a) {
 define <2 x i16> @ctlz_v2i16(<2 x i16> %op) {
 ; CHECK-LABEL: ctlz_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    sub z0.s, z0.s, #16 // =0x10
@@ -77,8 +77,8 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) {
 define <4 x i16> @ctlz_v4i16(<4 x i16> %op) {
 ; CHECK-LABEL: ctlz_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -89,8 +89,8 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) {
 define <8 x i16> @ctlz_v8i16(<8 x i16> %op) {
 ; CHECK-LABEL: ctlz_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -101,8 +101,8 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) {
 define void @ctlz_v16i16(ptr %a) {
 ; CHECK-LABEL: ctlz_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    clz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -116,8 +116,8 @@ define void @ctlz_v16i16(ptr %a) {
 define <2 x i32> @ctlz_v2i32(<2 x i32> %op) {
 ; CHECK-LABEL: ctlz_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -128,8 +128,8 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) {
 define <4 x i32> @ctlz_v4i32(<4 x i32> %op) {
 ; CHECK-LABEL: ctlz_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -140,8 +140,8 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) {
 define void @ctlz_v8i32(ptr %a) {
 ; CHECK-LABEL: ctlz_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    clz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -155,8 +155,8 @@ define void @ctlz_v8i32(ptr %a) {
 define <1 x i64> @ctlz_v1i64(<1 x i64> %op) {
 ; CHECK-LABEL: ctlz_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -167,8 +167,8 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) {
 define <2 x i64> @ctlz_v2i64(<2 x i64> %op) {
 ; CHECK-LABEL: ctlz_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -179,8 +179,8 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) {
 define void @ctlz_v4i64(ptr %a) {
 ; CHECK-LABEL: ctlz_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    clz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -198,8 +198,8 @@ define void @ctlz_v4i64(ptr %a) {
 define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 ; CHECK-LABEL: ctpop_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    and z0.h, z0.h, #0xff
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -211,8 +211,8 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 ; CHECK-LABEL: ctpop_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -223,8 +223,8 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 ; CHECK-LABEL: ctpop_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -235,8 +235,8 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 define void @ctpop_v32i8(ptr %a) {
 ; CHECK-LABEL: ctpop_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
 ; CHECK-NEXT:    cnt z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -250,8 +250,8 @@ define void @ctpop_v32i8(ptr %a) {
 define <2 x i16> @ctpop_v2i16(<2 x i16> %op) {
 ; CHECK-LABEL: ctpop_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -263,8 +263,8 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) {
 define <4 x i16> @ctpop_v4i16(<4 x i16> %op) {
 ; CHECK-LABEL: ctpop_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -275,8 +275,8 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) {
 define <8 x i16> @ctpop_v8i16(<8 x i16> %op) {
 ; CHECK-LABEL: ctpop_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -287,8 +287,8 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) {
 define void @ctpop_v16i16(ptr %a) {
 ; CHECK-LABEL: ctpop_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    cnt z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -302,8 +302,8 @@ define void @ctpop_v16i16(ptr %a) {
 define <2 x i32> @ctpop_v2i32(<2 x i32> %op) {
 ; CHECK-LABEL: ctpop_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -314,8 +314,8 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) {
 define <4 x i32> @ctpop_v4i32(<4 x i32> %op) {
 ; CHECK-LABEL: ctpop_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -326,8 +326,8 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) {
 define void @ctpop_v8i32(ptr %a) {
 ; CHECK-LABEL: ctpop_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    cnt z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -341,8 +341,8 @@ define void @ctpop_v8i32(ptr %a) {
 define <1 x i64> @ctpop_v1i64(<1 x i64> %op) {
 ; CHECK-LABEL: ctpop_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -353,8 +353,8 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) {
 define <2 x i64> @ctpop_v2i64(<2 x i64> %op) {
 ; CHECK-LABEL: ctpop_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -365,8 +365,8 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) {
 define void @ctpop_v4i64(ptr %a) {
 ; CHECK-LABEL: ctpop_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    cnt z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -384,8 +384,8 @@ define void @ctpop_v4i64(ptr %a) {
 define <4 x i8> @cttz_v4i8(<4 x i8> %op) {
 ; CHECK-LABEL: cttz_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orr z0.h, z0.h, #0x100
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
@@ -398,8 +398,8 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) {
 define <8 x i8> @cttz_v8i8(<8 x i8> %op) {
 ; CHECK-LABEL: cttz_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -411,8 +411,8 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) {
 define <16 x i8> @cttz_v16i8(<16 x i8> %op) {
 ; CHECK-LABEL: cttz_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -424,11 +424,11 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) {
 define void @cttz_v32i8(ptr %a) {
 ; CHECK-LABEL: cttz_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
-; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    rbit z1.b, p0/m, z1.b
+; CHECK-NEXT:    clz z0.b, p0/m, z0.b
 ; CHECK-NEXT:    clz z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -441,8 +441,8 @@ define void @cttz_v32i8(ptr %a) {
 define <2 x i16> @cttz_v2i16(<2 x i16> %op) {
 ; CHECK-LABEL: cttz_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orr z0.s, z0.s, #0x10000
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
@@ -455,8 +455,8 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) {
 define <4 x i16> @cttz_v4i16(<4 x i16> %op) {
 ; CHECK-LABEL: cttz_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -468,8 +468,8 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) {
 define <8 x i16> @cttz_v8i16(<8 x i16> %op) {
 ; CHECK-LABEL: cttz_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -481,11 +481,11 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) {
 define void @cttz_v16i16(ptr %a) {
 ; CHECK-LABEL: cttz_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
-; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    rbit z1.h, p0/m, z1.h
+; CHECK-NEXT:    clz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    clz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -498,8 +498,8 @@ define void @cttz_v16i16(ptr %a) {
 define <2 x i32> @cttz_v2i32(<2 x i32> %op) {
 ; CHECK-LABEL: cttz_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -511,8 +511,8 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) {
 define <4 x i32> @cttz_v4i32(<4 x i32> %op) {
 ; CHECK-LABEL: cttz_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -524,11 +524,11 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) {
 define void @cttz_v8i32(ptr %a) {
 ; CHECK-LABEL: cttz_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
-; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    rbit z1.s, p0/m, z1.s
+; CHECK-NEXT:    clz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    clz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -541,8 +541,8 @@ define void @cttz_v8i32(ptr %a) {
 define <1 x i64> @cttz_v1i64(<1 x i64> %op) {
 ; CHECK-LABEL: cttz_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -554,8 +554,8 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) {
 define <2 x i64> @cttz_v2i64(<2 x i64> %op) {
 ; CHECK-LABEL: cttz_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -567,11 +567,11 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) {
 define void @cttz_v4i64(ptr %a) {
 ; CHECK-LABEL: cttz_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
-; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    rbit z1.d, p0/m, z1.d
+; CHECK-NEXT:    clz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    clz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index 9fee61cd0d8337..e09179b726a6ea 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -66,9 +66,9 @@ define void @bitcast_v2i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [sp]
 ; CHECK-NEXT:    mov z1.s, z0.s[1]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s1
 ; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    strh w9, [sp, #10]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str w8, [x1]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
index 9ffd05b799b241..16f43c61ac64ec 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
@@ -12,22 +12,22 @@ target triple = "aarch64"
 define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %right_ptr) {
 ; CHECK-LABEL: fixed_bitselect_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    mov z0.s, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    add z7.s, z3.s, z0.s
-; CHECK-NEXT:    subr z3.s, z3.s, #0 // =0x0
-; CHECK-NEXT:    ldp q1, q4, [x1]
+; CHECK-NEXT:    ldp q2, q1, [x0]
+; CHECK-NEXT:    ldp q5, q4, [x1]
+; CHECK-NEXT:    ldp q6, q7, [x2]
+; CHECK-NEXT:    add z3.s, z1.s, z0.s
+; CHECK-NEXT:    subr z1.s, z1.s, #0 // =0x0
 ; CHECK-NEXT:    add z0.s, z2.s, z0.s
 ; CHECK-NEXT:    subr z2.s, z2.s, #0 // =0x0
-; CHECK-NEXT:    and z1.d, z3.d, z1.d
-; CHECK-NEXT:    ldp q5, q6, [x2]
-; CHECK-NEXT:    and z2.d, z2.d, z4.d
-; CHECK-NEXT:    and z3.d, z0.d, z6.d
-; CHECK-NEXT:    and z0.d, z7.d, z5.d
-; CHECK-NEXT:    orr z0.d, z0.d, z1.d
-; CHECK-NEXT:    orr z1.d, z3.d, z2.d
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    and z1.d, z1.d, z4.d
+; CHECK-NEXT:    and z3.d, z3.d, z7.d
+; CHECK-NEXT:    and z0.d, z0.d, z6.d
+; CHECK-NEXT:    and z2.d, z2.d, z5.d
+; CHECK-NEXT:    orr z1.d, z3.d, z1.d
+; CHECK-NEXT:    orr z0.d, z0.d, z2.d
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %pre_cond = load <8 x i32>, ptr %pre_cond_ptr
   %left = load <8 x i32>, ptr %left_ptr

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index 745ba26d92ca0f..52bca4256722a8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -56,12 +56,12 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) {
 ; CHECK-LABEL: build_vector_minus2_dec32_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x8, #-32 // =0xffffffffffffffe0
-; CHECK-NEXT:    mov z0.d, #-66 // =0xffffffffffffffbe
+; CHECK-NEXT:    mov z1.d, #-66 // =0xffffffffffffffbe
 ; CHECK-NEXT:    mov z2.d, #-2 // =0xfffffffffffffffe
-; CHECK-NEXT:    index z1.d, #0, x8
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    add z1.d, z1.d, z2.d
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    index z0.d, #0, x8
+; CHECK-NEXT:    add z1.d, z0.d, z1.d
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   store <4 x i64> <i64 -2, i64 -34, i64 -66, i64 -98>, ptr %a, align 8
   ret void

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 767d7c0fa4a02a..3c35b53f9cdf6b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -13,29 +13,29 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2)  {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z2.h, z1.h[3]
 ; CHECK-NEXT:    mov z3.h, z1.h[2]
-; CHECK-NEXT:    mov z4.h, z1.h[1]
-; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    mov z1.h, z1.h[1]
+; CHECK-NEXT:    mov z4.h, z0.h[3]
 ; CHECK-NEXT:    strb w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strb w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    mov z5.h, z0.h[2]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.h, z0.h[2]
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    strb w10, [sp, #15]
-; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    strb w8, [sp, #15]
+; CHECK-NEXT:    fmov w8, s3
 ; CHECK-NEXT:    strb w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strb w9, [sp, #13]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strb w10, [sp, #11]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strb w8, [sp, #13]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strb w8, [sp, #11]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strb w8, [sp, #10]
-; CHECK-NEXT:    strb w9, [sp, #9]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [sp, #9]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
@@ -46,8 +46,8 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2)  {
 define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2)  {
 ; CHECK-LABEL: concat_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -78,9 +78,9 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: concat_v64i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
-; CHECK-NEXT:    stp q2, q3, [x2]
+; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -107,16 +107,16 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2)  {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z2.s, z1.s[1]
 ; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z1.s, z1.s[1]
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    mov z1.s, z0.s[1]
 ; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    strh w10, [sp, #14]
-; CHECK-NEXT:    strh w11, [sp, #10]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    strh w9, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
@@ -128,8 +128,8 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2)  {
 define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2)  {
 ; CHECK-LABEL: concat_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -157,9 +157,9 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: concat_v32i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
-; CHECK-NEXT:    stp q2, q3, [x2]
+; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -192,8 +192,8 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2)  {
 define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2)  {
 ; CHECK-LABEL: concat_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -220,9 +220,9 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: concat_v16i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
-; CHECK-NEXT:    stp q2, q3, [x2]
+; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -240,8 +240,8 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) {
 define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2)  {
 ; CHECK-LABEL: concat_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -268,9 +268,9 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: concat_v8i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
-; CHECK-NEXT:    stp q2, q3, [x2]
+; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -290,12 +290,12 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z2.h, z1.h[1]
 ; CHECK-NEXT:    str h1, [sp, #12]
+; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    str h0, [sp, #8]
-; CHECK-NEXT:    mov z1.h, z1.h[1]
-; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    str h1, [sp, #14]
-; CHECK-NEXT:    str h0, [sp, #10]
+; CHECK-NEXT:    str h2, [sp, #14]
+; CHECK-NEXT:    str h1, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
@@ -306,8 +306,8 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2)  {
 define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2)  {
 ; CHECK-LABEL: concat_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -335,9 +335,9 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: concat_v32f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
-; CHECK-NEXT:    stp q2, q3, [x2]
+; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -370,8 +370,8 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2)  {
 define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2)  {
 ; CHECK-LABEL: concat_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -398,9 +398,9 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: concat_v16f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
-; CHECK-NEXT:    stp q2, q3, [x2]
+; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -418,8 +418,8 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) {
 define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2)  {
 ; CHECK-LABEL: concat_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -446,9 +446,9 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: concat_v8f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x1]
-; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    stp q0, q1, [x2, #32]
-; CHECK-NEXT:    stp q2, q3, [x2]
+; CHECK-NEXT:    stp q3, q2, [x2]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index c4082142ada78b..3b007e105bb377 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -50,8 +50,8 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) {
 ; CHECK-NEXT:    mov x6, xzr
 ; CHECK-NEXT:    mov z1.d, z0.d[1]
 ; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    fmov x4, d1
 ; CHECK-NEXT:    mov x7, xzr
+; CHECK-NEXT:    fmov x4, d1
 ; CHECK-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = zext <2 x i64> %a to <2 x i256>
@@ -61,18 +61,18 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) {
 define <16 x i32> @load_sext_v16i8i32(ptr %ap)  {
 ; CHECK-LABEL: load_sext_v16i8i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    mov w8, #4 // =0x4
 ; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    mov w10, #12 // =0xc
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ld1sb { z1.s }, p0/z, [x0, x8]
 ; CHECK-NEXT:    ld1sb { z2.s }, p0/z, [x0, x9]
 ; CHECK-NEXT:    ld1sb { z3.s }, p0/z, [x0, x10]
 ; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $z2
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $z3
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %a = load <16 x i8>, ptr %ap
   %val = sext <16 x i8> %a to <16 x i32>
@@ -82,8 +82,8 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap)  {
 define <8 x i32> @load_sext_v8i16i32(ptr %ap)  {
 ; CHECK-LABEL: load_sext_v8i16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ld1sh { z1.s }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -100,25 +100,25 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    sunpklo z1.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    fmov x9, d1
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    mov z1.d, z1.d[1]
 ; CHECK-NEXT:    fmov x11, d0
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    asr x10, x9, #63
-; CHECK-NEXT:    asr x12, x11, #63
 ; CHECK-NEXT:    stp x9, x10, [x8]
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    mov z0.d, z1.d[1]
+; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    asr x12, x11, #63
+; CHECK-NEXT:    stp x10, x10, [x8, #16]
 ; CHECK-NEXT:    stp x11, x12, [x8, #64]
 ; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    stp x10, x10, [x8, #16]
-; CHECK-NEXT:    stp x12, x12, [x8, #80]
 ; CHECK-NEXT:    asr x10, x9, #63
+; CHECK-NEXT:    stp x12, x12, [x8, #80]
+; CHECK-NEXT:    stp x10, x10, [x8, #48]
 ; CHECK-NEXT:    asr x12, x11, #63
-; CHECK-NEXT:    stp x10, x10, [x8, #112]
-; CHECK-NEXT:    stp x9, x10, [x8, #96]
-; CHECK-NEXT:    stp x12, x12, [x8, #48]
-; CHECK-NEXT:    stp x11, x12, [x8, #32]
+; CHECK-NEXT:    stp x9, x10, [x8, #32]
+; CHECK-NEXT:    stp x12, x12, [x8, #112]
+; CHECK-NEXT:    stp x11, x12, [x8, #96]
 ; CHECK-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
   %val = sext <4 x i32> %a to <4 x i256>
@@ -130,28 +130,28 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    fmov x10, d0
+; CHECK-NEXT:    mov z1.d, z0.d[1]
 ; CHECK-NEXT:    asr x9, x8, #63
-; CHECK-NEXT:    asr x11, x10, #63
+; CHECK-NEXT:    fmov x10, d1
 ; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    asr x8, x10, #63
 ; CHECK-NEXT:    mov z0.d, x9
-; CHECK-NEXT:    stp x10, x11, [sp, #16]
-; CHECK-NEXT:    mov z1.d, z0.d[1]
+; CHECK-NEXT:    stp x10, x8, [sp, #16]
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    ldp q2, q4, [sp], #32
+; CHECK-NEXT:    mov z3.d, z0.d[1]
+; CHECK-NEXT:    mov z5.d, z1.d[1]
+; CHECK-NEXT:    mov z6.d, z2.d[1]
 ; CHECK-NEXT:    fmov x2, d0
-; CHECK-NEXT:    mov z0.d, x11
-; CHECK-NEXT:    fmov x3, d1
-; CHECK-NEXT:    ldp q1, q3, [sp], #32
-; CHECK-NEXT:    mov z2.d, z0.d[1]
-; CHECK-NEXT:    fmov x6, d0
-; CHECK-NEXT:    mov z0.d, z1.d[1]
-; CHECK-NEXT:    fmov x0, d1
-; CHECK-NEXT:    mov z1.d, z3.d[1]
-; CHECK-NEXT:    fmov x7, d2
-; CHECK-NEXT:    fmov x4, d3
-; CHECK-NEXT:    fmov x1, d0
-; CHECK-NEXT:    fmov x5, d1
+; CHECK-NEXT:    mov z0.d, z4.d[1]
+; CHECK-NEXT:    fmov x6, d1
+; CHECK-NEXT:    fmov x0, d2
+; CHECK-NEXT:    fmov x4, d4
+; CHECK-NEXT:    fmov x3, d3
+; CHECK-NEXT:    fmov x7, d5
+; CHECK-NEXT:    fmov x1, d6
+; CHECK-NEXT:    fmov x5, d0
 ; CHECK-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
   %val = sext <2 x i64> %a to <2 x i256>
@@ -161,29 +161,29 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 define <16 x i64> @load_zext_v16i16i64(ptr %ap)  {
 ; CHECK-LABEL: load_zext_v16i16i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    mov x9, #4 // =0x4
-; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    mov x10, #6 // =0x6
-; CHECK-NEXT:    mov x11, #8 // =0x8
-; CHECK-NEXT:    mov x12, #10 // =0xa
 ; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, x8, lsl #1]
-; CHECK-NEXT:    mov x8, #12 // =0xc
+; CHECK-NEXT:    mov x8, #6 // =0x6
 ; CHECK-NEXT:    ld1h { z2.d }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT:    mov x9, #14 // =0xe
-; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x0, x10, lsl #1]
-; CHECK-NEXT:    ld1h { z4.d }, p0/z, [x0, x11, lsl #1]
-; CHECK-NEXT:    ld1h { z5.d }, p0/z, [x0, x12, lsl #1]
-; CHECK-NEXT:    ld1h { z6.d }, p0/z, [x0, x8, lsl #1]
-; CHECK-NEXT:    ld1h { z7.d }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    mov x9, #8 // =0x8
+; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #10 // =0xa
+; CHECK-NEXT:    ld1h { z4.d }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT:    mov x9, #12 // =0xc
+; CHECK-NEXT:    ld1h { z5.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #14 // =0xe
+; CHECK-NEXT:    ld1h { z6.d }, p0/z, [x0, x9, lsl #1]
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 killed $z2
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $z3
 ; CHECK-NEXT:    // kill: def $q4 killed $q4 killed $z4
 ; CHECK-NEXT:    // kill: def $q5 killed $q5 killed $z5
+; CHECK-NEXT:    ld1h { z7.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    // kill: def $q6 killed $q6 killed $z6
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    // kill: def $q7 killed $q7 killed $z7
 ; CHECK-NEXT:    ret
   %a = load <16 x i16>, ptr %ap

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index cc046e57b5996a..08eec6cb288eec 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -13,16 +13,16 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.b, z0.b[7]
 ; CHECK-NEXT:    mov z2.b, z0.b[6]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.b, z0.b[5]
+; CHECK-NEXT:    mov z3.b, z0.b[5]
 ; CHECK-NEXT:    mov z0.b, z0.b[4]
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w11, s0
 ; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s3
 ; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w10, [sp, #10]
-; CHECK-NEXT:    strh w11, [sp, #8]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    strh w9, [sp, #8]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
@@ -40,16 +40,16 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.b, z0.b[7]
 ; CHECK-NEXT:    mov z2.b, z0.b[6]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.b, z0.b[5]
+; CHECK-NEXT:    mov z3.b, z0.b[5]
 ; CHECK-NEXT:    mov z0.b, z0.b[4]
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    fmov w11, s0
 ; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s3
 ; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w10, [sp, #10]
-; CHECK-NEXT:    strh w11, [sp, #8]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    strh w9, [sp, #8]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index fd9dd17d17c2d4..8150cba5e9b84f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -13,19 +13,19 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) {
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ldr d0, [x0]
 ; SVE-NEXT:    ldr d1, [x1]
-; SVE-NEXT:    and z0.h, z0.h, #0x7fff
 ; SVE-NEXT:    and z1.h, z1.h, #0x8000
+; SVE-NEXT:    and z0.h, z0.h, #0x7fff
 ; SVE-NEXT:    orr z0.d, z0.d, z1.d
 ; SVE-NEXT:    str d0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v4f16_v4f16:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldr d0, [x0]
-; SVE2-NEXT:    mov z2.h, #32767 // =0x7fff
-; SVE2-NEXT:    ldr d1, [x1]
-; SVE2-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; SVE2-NEXT:    str d0, [x0]
+; SVE2-NEXT:    mov z0.h, #32767 // =0x7fff
+; SVE2-NEXT:    ldr d1, [x0]
+; SVE2-NEXT:    ldr d2, [x1]
+; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x half>, ptr %bp
@@ -39,19 +39,19 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) {
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ldr q0, [x0]
 ; SVE-NEXT:    ldr q1, [x1]
-; SVE-NEXT:    and z0.h, z0.h, #0x7fff
 ; SVE-NEXT:    and z1.h, z1.h, #0x8000
+; SVE-NEXT:    and z0.h, z0.h, #0x7fff
 ; SVE-NEXT:    orr z0.d, z0.d, z1.d
 ; SVE-NEXT:    str q0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v8f16_v8f16:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldr q0, [x0]
-; SVE2-NEXT:    mov z2.h, #32767 // =0x7fff
-; SVE2-NEXT:    ldr q1, [x1]
-; SVE2-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; SVE2-NEXT:    str q0, [x0]
+; SVE2-NEXT:    mov z0.h, #32767 // =0x7fff
+; SVE2-NEXT:    ldr q1, [x0]
+; SVE2-NEXT:    ldr q2, [x1]
+; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
   %a = load <8 x half>, ptr %ap
   %b = load <8 x half>, ptr %bp
@@ -63,25 +63,25 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) {
 define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v16f16_v16f16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x1]
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
 ; SVE-NEXT:    and z0.h, z0.h, #0x8000
-; SVE-NEXT:    ldp q2, q3, [x0]
-; SVE-NEXT:    and z1.h, z1.h, #0x8000
+; SVE-NEXT:    and z3.h, z3.h, #0x8000
+; SVE-NEXT:    and z1.h, z1.h, #0x7fff
 ; SVE-NEXT:    and z2.h, z2.h, #0x7fff
-; SVE-NEXT:    orr z0.d, z2.d, z0.d
-; SVE-NEXT:    and z3.h, z3.h, #0x7fff
-; SVE-NEXT:    orr z1.d, z3.d, z1.d
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
+; SVE-NEXT:    orr z1.d, z2.d, z3.d
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v16f16_v16f16:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q1, q2, [x0]
 ; SVE2-NEXT:    mov z0.h, #32767 // =0x7fff
-; SVE2-NEXT:    ldp q3, q4, [x1]
-; SVE2-NEXT:    bsl z1.d, z1.d, z3.d, z0.d
-; SVE2-NEXT:    bsl z2.d, z2.d, z4.d, z0.d
-; SVE2-NEXT:    stp q1, q2, [x0]
+; SVE2-NEXT:    ldp q1, q4, [x1]
+; SVE2-NEXT:    ldp q2, q3, [x0]
+; SVE2-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
+; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
   %a = load <16 x half>, ptr %ap
   %b = load <16 x half>, ptr %bp
@@ -97,19 +97,19 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) {
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ldr d0, [x0]
 ; SVE-NEXT:    ldr d1, [x1]
-; SVE-NEXT:    and z0.s, z0.s, #0x7fffffff
 ; SVE-NEXT:    and z1.s, z1.s, #0x80000000
+; SVE-NEXT:    and z0.s, z0.s, #0x7fffffff
 ; SVE-NEXT:    orr z0.d, z0.d, z1.d
 ; SVE-NEXT:    str d0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v2f32_v2f32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldr d0, [x0]
-; SVE2-NEXT:    mov z2.s, #0x7fffffff
-; SVE2-NEXT:    ldr d1, [x1]
-; SVE2-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; SVE2-NEXT:    str d0, [x0]
+; SVE2-NEXT:    mov z0.s, #0x7fffffff
+; SVE2-NEXT:    ldr d1, [x0]
+; SVE2-NEXT:    ldr d2, [x1]
+; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT:    str d1, [x0]
 ; SVE2-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x float>, ptr %bp
@@ -123,19 +123,19 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) {
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ldr q0, [x0]
 ; SVE-NEXT:    ldr q1, [x1]
-; SVE-NEXT:    and z0.s, z0.s, #0x7fffffff
 ; SVE-NEXT:    and z1.s, z1.s, #0x80000000
+; SVE-NEXT:    and z0.s, z0.s, #0x7fffffff
 ; SVE-NEXT:    orr z0.d, z0.d, z1.d
 ; SVE-NEXT:    str q0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v4f32_v4f32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldr q0, [x0]
-; SVE2-NEXT:    mov z2.s, #0x7fffffff
-; SVE2-NEXT:    ldr q1, [x1]
-; SVE2-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; SVE2-NEXT:    str q0, [x0]
+; SVE2-NEXT:    mov z0.s, #0x7fffffff
+; SVE2-NEXT:    ldr q1, [x0]
+; SVE2-NEXT:    ldr q2, [x1]
+; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
   %a = load <4 x float>, ptr %ap
   %b = load <4 x float>, ptr %bp
@@ -147,25 +147,25 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) {
 define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v8f32_v8f32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x1]
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
 ; SVE-NEXT:    and z0.s, z0.s, #0x80000000
-; SVE-NEXT:    ldp q2, q3, [x0]
-; SVE-NEXT:    and z1.s, z1.s, #0x80000000
+; SVE-NEXT:    and z3.s, z3.s, #0x80000000
+; SVE-NEXT:    and z1.s, z1.s, #0x7fffffff
 ; SVE-NEXT:    and z2.s, z2.s, #0x7fffffff
-; SVE-NEXT:    orr z0.d, z2.d, z0.d
-; SVE-NEXT:    and z3.s, z3.s, #0x7fffffff
-; SVE-NEXT:    orr z1.d, z3.d, z1.d
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
+; SVE-NEXT:    orr z1.d, z2.d, z3.d
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v8f32_v8f32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q1, q2, [x0]
 ; SVE2-NEXT:    mov z0.s, #0x7fffffff
-; SVE2-NEXT:    ldp q3, q4, [x1]
-; SVE2-NEXT:    bsl z1.d, z1.d, z3.d, z0.d
-; SVE2-NEXT:    bsl z2.d, z2.d, z4.d, z0.d
-; SVE2-NEXT:    stp q1, q2, [x0]
+; SVE2-NEXT:    ldp q1, q4, [x1]
+; SVE2-NEXT:    ldp q2, q3, [x0]
+; SVE2-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
+; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
   %a = load <8 x float>, ptr %ap
   %b = load <8 x float>, ptr %bp
@@ -181,19 +181,19 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) {
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ldr q0, [x0]
 ; SVE-NEXT:    ldr q1, [x1]
-; SVE-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
 ; SVE-NEXT:    and z1.d, z1.d, #0x8000000000000000
+; SVE-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
 ; SVE-NEXT:    orr z0.d, z0.d, z1.d
 ; SVE-NEXT:    str q0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v2f64_v2f64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldr q0, [x0]
-; SVE2-NEXT:    mov z2.d, #0x7fffffffffffffff
-; SVE2-NEXT:    ldr q1, [x1]
-; SVE2-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; SVE2-NEXT:    str q0, [x0]
+; SVE2-NEXT:    mov z0.d, #0x7fffffffffffffff
+; SVE2-NEXT:    ldr q1, [x0]
+; SVE2-NEXT:    ldr q2, [x1]
+; SVE2-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; SVE2-NEXT:    str q1, [x0]
 ; SVE2-NEXT:    ret
   %a = load <2 x double>, ptr %ap
   %b = load <2 x double>, ptr %bp
@@ -205,25 +205,25 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) {
 define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v4f64_v4f64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x1]
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
 ; SVE-NEXT:    and z0.d, z0.d, #0x8000000000000000
-; SVE-NEXT:    ldp q2, q3, [x0]
-; SVE-NEXT:    and z1.d, z1.d, #0x8000000000000000
+; SVE-NEXT:    and z3.d, z3.d, #0x8000000000000000
+; SVE-NEXT:    and z1.d, z1.d, #0x7fffffffffffffff
 ; SVE-NEXT:    and z2.d, z2.d, #0x7fffffffffffffff
-; SVE-NEXT:    orr z0.d, z2.d, z0.d
-; SVE-NEXT:    and z3.d, z3.d, #0x7fffffffffffffff
-; SVE-NEXT:    orr z1.d, z3.d, z1.d
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
+; SVE-NEXT:    orr z1.d, z2.d, z3.d
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v4f64_v4f64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q1, q2, [x0]
 ; SVE2-NEXT:    mov z0.d, #0x7fffffffffffffff
-; SVE2-NEXT:    ldp q3, q4, [x1]
-; SVE2-NEXT:    bsl z1.d, z1.d, z3.d, z0.d
-; SVE2-NEXT:    bsl z2.d, z2.d, z4.d, z0.d
-; SVE2-NEXT:    stp q1, q2, [x0]
+; SVE2-NEXT:    ldp q1, q4, [x1]
+; SVE2-NEXT:    ldp q2, q3, [x0]
+; SVE2-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; SVE2-NEXT:    bsl z3.d, z3.d, z4.d, z0.d
+; SVE2-NEXT:    stp q2, q3, [x0]
 ; SVE2-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x double>, ptr %bp
@@ -237,12 +237,12 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) {
 define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v2f32_v2f64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldr q0, [x1]
 ; SVE-NEXT:    ptrue p0.d
+; SVE-NEXT:    ldr q0, [x1]
 ; SVE-NEXT:    ldr d1, [x0]
+; SVE-NEXT:    and z1.s, z1.s, #0x7fffffff
 ; SVE-NEXT:    fcvt z0.s, p0/m, z0.d
 ; SVE-NEXT:    uzp1 z0.s, z0.s, z0.s
-; SVE-NEXT:    and z1.s, z1.s, #0x7fffffff
 ; SVE-NEXT:    and z0.s, z0.s, #0x80000000
 ; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str d0, [x0]
@@ -250,14 +250,14 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) {
 ;
 ; SVE2-LABEL: test_copysign_v2f32_v2f64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldr q0, [x1]
 ; SVE2-NEXT:    ptrue p0.d
-; SVE2-NEXT:    ldr d1, [x0]
-; SVE2-NEXT:    mov z2.s, #0x7fffffff
+; SVE2-NEXT:    ldr q0, [x1]
+; SVE2-NEXT:    mov z1.s, #0x7fffffff
+; SVE2-NEXT:    ldr d2, [x0]
 ; SVE2-NEXT:    fcvt z0.s, p0/m, z0.d
 ; SVE2-NEXT:    uzp1 z0.s, z0.s, z0.s
-; SVE2-NEXT:    bsl z1.d, z1.d, z0.d, z2.d
-; SVE2-NEXT:    str d1, [x0]
+; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
   %a = load <2 x float>, ptr %ap
   %b = load <2 x double>, ptr %bp
@@ -273,34 +273,34 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) {
 define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v4f32_v4f64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q1, q0, [x1]
 ; SVE-NEXT:    ptrue p0.d
+; SVE-NEXT:    ldp q0, q1, [x1]
 ; SVE-NEXT:    fcvt z1.s, p0/m, z1.d
-; SVE-NEXT:    uzp1 z1.s, z1.s, z1.s
-; SVE-NEXT:    ldr q2, [x0]
 ; SVE-NEXT:    fcvt z0.s, p0/m, z0.d
-; SVE-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; SVE-NEXT:    ptrue p0.s, vl2
-; SVE-NEXT:    splice z1.s, p0, z1.s, z0.s
-; SVE-NEXT:    and z1.s, z1.s, #0x80000000
-; SVE-NEXT:    and z2.s, z2.s, #0x7fffffff
-; SVE-NEXT:    orr z0.d, z2.d, z1.d
+; SVE-NEXT:    uzp1 z1.s, z1.s, z1.s
+; SVE-NEXT:    uzp1 z0.s, z0.s, z0.s
+; SVE-NEXT:    splice z0.s, p0, z0.s, z1.s
+; SVE-NEXT:    ldr q1, [x0]
+; SVE-NEXT:    and z1.s, z1.s, #0x7fffffff
+; SVE-NEXT:    and z0.s, z0.s, #0x80000000
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str q0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v4f32_v4f64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q1, q0, [x1]
 ; SVE2-NEXT:    ptrue p0.d
-; SVE2-NEXT:    fcvt z1.s, p0/m, z1.d
-; SVE2-NEXT:    uzp1 z1.s, z1.s, z1.s
+; SVE2-NEXT:    ldp q0, q1, [x1]
 ; SVE2-NEXT:    ldr q2, [x0]
+; SVE2-NEXT:    fcvt z1.s, p0/m, z1.d
 ; SVE2-NEXT:    fcvt z0.s, p0/m, z0.d
-; SVE2-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; SVE2-NEXT:    ptrue p0.s, vl2
-; SVE2-NEXT:    splice z1.s, p0, z1.s, z0.s
-; SVE2-NEXT:    mov z0.s, #0x7fffffff
-; SVE2-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; SVE2-NEXT:    uzp1 z1.s, z1.s, z1.s
+; SVE2-NEXT:    uzp1 z0.s, z0.s, z0.s
+; SVE2-NEXT:    splice z0.s, p0, z0.s, z1.s
+; SVE2-NEXT:    mov z1.s, #0x7fffffff
+; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
   %a = load <4 x float>, ptr %ap
@@ -318,8 +318,8 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) {
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    ptrue p0.d, vl2
 ; SVE-NEXT:    ldr q0, [x0]
-; SVE-NEXT:    ld1w { z1.d }, p0/z, [x1]
 ; SVE-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
+; SVE-NEXT:    ld1w { z1.d }, p0/z, [x1]
 ; SVE-NEXT:    fcvt z1.d, p0/m, z1.s
 ; SVE-NEXT:    and z1.d, z1.d, #0x8000000000000000
 ; SVE-NEXT:    orr z0.d, z0.d, z1.d
@@ -330,8 +330,8 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) {
 ; SVE2:       // %bb.0:
 ; SVE2-NEXT:    ptrue p0.d, vl2
 ; SVE2-NEXT:    ldr q0, [x0]
-; SVE2-NEXT:    ld1w { z1.d }, p0/z, [x1]
 ; SVE2-NEXT:    mov z2.d, #0x7fffffffffffffff
+; SVE2-NEXT:    ld1w { z1.d }, p0/z, [x1]
 ; SVE2-NEXT:    fcvt z1.d, p0/m, z1.s
 ; SVE2-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
 ; SVE2-NEXT:    str q0, [x0]
@@ -350,15 +350,15 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) {
 define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v4f64_v4f32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    mov x8, #2 // =0x2
 ; SVE-NEXT:    ptrue p0.d, vl2
+; SVE-NEXT:    mov x8, #2 // =0x2
 ; SVE-NEXT:    ldp q0, q1, [x0]
+; SVE-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
+; SVE-NEXT:    and z1.d, z1.d, #0x7fffffffffffffff
 ; SVE-NEXT:    ld1w { z2.d }, p0/z, [x1, x8, lsl #2]
 ; SVE-NEXT:    ld1w { z3.d }, p0/z, [x1]
-; SVE-NEXT:    and z0.d, z0.d, #0x7fffffffffffffff
 ; SVE-NEXT:    fcvt z3.d, p0/m, z3.s
 ; SVE-NEXT:    fcvt z2.d, p0/m, z2.s
-; SVE-NEXT:    and z1.d, z1.d, #0x7fffffffffffffff
 ; SVE-NEXT:    and z3.d, z3.d, #0x8000000000000000
 ; SVE-NEXT:    and z2.d, z2.d, #0x8000000000000000
 ; SVE-NEXT:    orr z0.d, z0.d, z3.d
@@ -368,10 +368,10 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) {
 ;
 ; SVE2-LABEL: test_copysign_v4f64_v4f32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    mov x8, #2 // =0x2
 ; SVE2-NEXT:    ptrue p0.d, vl2
-; SVE2-NEXT:    ldp q0, q1, [x0]
+; SVE2-NEXT:    mov x8, #2 // =0x2
 ; SVE2-NEXT:    mov z4.d, #0x7fffffffffffffff
+; SVE2-NEXT:    ldp q0, q1, [x0]
 ; SVE2-NEXT:    ld1w { z2.d }, p0/z, [x1, x8, lsl #2]
 ; SVE2-NEXT:    ld1w { z3.d }, p0/z, [x1]
 ; SVE2-NEXT:    fcvt z3.d, p0/m, z3.s
@@ -393,12 +393,12 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) {
 define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v4f16_v4f32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldr q0, [x1]
 ; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    ldr q0, [x1]
 ; SVE-NEXT:    ldr d1, [x0]
+; SVE-NEXT:    and z1.h, z1.h, #0x7fff
 ; SVE-NEXT:    fcvt z0.h, p0/m, z0.s
 ; SVE-NEXT:    uzp1 z0.h, z0.h, z0.h
-; SVE-NEXT:    and z1.h, z1.h, #0x7fff
 ; SVE-NEXT:    and z0.h, z0.h, #0x8000
 ; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str d0, [x0]
@@ -406,14 +406,14 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) {
 ;
 ; SVE2-LABEL: test_copysign_v4f16_v4f32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldr q0, [x1]
 ; SVE2-NEXT:    ptrue p0.s
-; SVE2-NEXT:    ldr d1, [x0]
-; SVE2-NEXT:    mov z2.h, #32767 // =0x7fff
+; SVE2-NEXT:    ldr q0, [x1]
+; SVE2-NEXT:    mov z1.h, #32767 // =0x7fff
+; SVE2-NEXT:    ldr d2, [x0]
 ; SVE2-NEXT:    fcvt z0.h, p0/m, z0.s
 ; SVE2-NEXT:    uzp1 z0.h, z0.h, z0.h
-; SVE2-NEXT:    bsl z1.d, z1.d, z0.d, z2.d
-; SVE2-NEXT:    str d1, [x0]
+; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
+; SVE2-NEXT:    str d2, [x0]
 ; SVE2-NEXT:    ret
   %a = load <4 x half>, ptr %ap
   %b = load <4 x float>, ptr %bp
@@ -429,18 +429,18 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ; SVE-NEXT:    sub sp, sp, #16
 ; SVE-NEXT:    .cfi_def_cfa_offset 16
 ; SVE-NEXT:    ldp q1, q0, [x1]
-; SVE-NEXT:    fcvt h3, d1
-; SVE-NEXT:    mov z1.d, z1.d[1]
-; SVE-NEXT:    fcvt h1, d1
-; SVE-NEXT:    fcvt h2, d0
-; SVE-NEXT:    mov z0.d, z0.d[1]
-; SVE-NEXT:    fcvt h0, d0
 ; SVE-NEXT:    ldr d4, [x0]
-; SVE-NEXT:    str h3, [sp, #8]
-; SVE-NEXT:    str h1, [sp, #10]
-; SVE-NEXT:    str h2, [sp, #12]
 ; SVE-NEXT:    and z4.h, z4.h, #0x7fff
-; SVE-NEXT:    str h0, [sp, #14]
+; SVE-NEXT:    mov z2.d, z0.d[1]
+; SVE-NEXT:    mov z3.d, z1.d[1]
+; SVE-NEXT:    fcvt h0, d0
+; SVE-NEXT:    fcvt h1, d1
+; SVE-NEXT:    fcvt h2, d2
+; SVE-NEXT:    fcvt h3, d3
+; SVE-NEXT:    str h0, [sp, #12]
+; SVE-NEXT:    str h1, [sp, #8]
+; SVE-NEXT:    str h2, [sp, #14]
+; SVE-NEXT:    str h3, [sp, #10]
 ; SVE-NEXT:    ldr d0, [sp, #8]
 ; SVE-NEXT:    and z0.h, z0.h, #0x8000
 ; SVE-NEXT:    orr z0.d, z4.d, z0.d
@@ -452,22 +452,22 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 ; SVE2:       // %bb.0:
 ; SVE2-NEXT:    sub sp, sp, #16
 ; SVE2-NEXT:    .cfi_def_cfa_offset 16
-; SVE2-NEXT:    ldp q1, q0, [x1]
-; SVE2-NEXT:    fcvt h3, d1
-; SVE2-NEXT:    mov z1.d, z1.d[1]
+; SVE2-NEXT:    ldp q2, q1, [x1]
+; SVE2-NEXT:    mov z0.h, #32767 // =0x7fff
+; SVE2-NEXT:    ldr d5, [x0]
+; SVE2-NEXT:    mov z3.d, z1.d[1]
+; SVE2-NEXT:    mov z4.d, z2.d[1]
 ; SVE2-NEXT:    fcvt h1, d1
-; SVE2-NEXT:    fcvt h2, d0
-; SVE2-NEXT:    mov z0.d, z0.d[1]
-; SVE2-NEXT:    fcvt h0, d0
-; SVE2-NEXT:    ldr d4, [x0]
-; SVE2-NEXT:    str h3, [sp, #8]
-; SVE2-NEXT:    str h1, [sp, #10]
-; SVE2-NEXT:    mov z1.h, #32767 // =0x7fff
-; SVE2-NEXT:    str h2, [sp, #12]
-; SVE2-NEXT:    str h0, [sp, #14]
-; SVE2-NEXT:    ldr d0, [sp, #8]
-; SVE2-NEXT:    bsl z4.d, z4.d, z0.d, z1.d
-; SVE2-NEXT:    str d4, [x0]
+; SVE2-NEXT:    fcvt h2, d2
+; SVE2-NEXT:    fcvt h3, d3
+; SVE2-NEXT:    fcvt h4, d4
+; SVE2-NEXT:    str h1, [sp, #12]
+; SVE2-NEXT:    str h2, [sp, #8]
+; SVE2-NEXT:    str h3, [sp, #14]
+; SVE2-NEXT:    str h4, [sp, #10]
+; SVE2-NEXT:    ldr d1, [sp, #8]
+; SVE2-NEXT:    bsl z5.d, z5.d, z1.d, z0.d
+; SVE2-NEXT:    str d5, [x0]
 ; SVE2-NEXT:    add sp, sp, #16
 ; SVE2-NEXT:    ret
   %a = load <4 x half>, ptr %ap
@@ -483,34 +483,34 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) {
 define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) {
 ; SVE-LABEL: test_copysign_v8f16_v8f32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q1, q0, [x1]
 ; SVE-NEXT:    ptrue p0.s
+; SVE-NEXT:    ldp q0, q1, [x1]
 ; SVE-NEXT:    fcvt z1.h, p0/m, z1.s
-; SVE-NEXT:    uzp1 z1.h, z1.h, z1.h
-; SVE-NEXT:    ldr q2, [x0]
 ; SVE-NEXT:    fcvt z0.h, p0/m, z0.s
-; SVE-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; SVE-NEXT:    ptrue p0.h, vl4
-; SVE-NEXT:    splice z1.h, p0, z1.h, z0.h
-; SVE-NEXT:    and z1.h, z1.h, #0x8000
-; SVE-NEXT:    and z2.h, z2.h, #0x7fff
-; SVE-NEXT:    orr z0.d, z2.d, z1.d
+; SVE-NEXT:    uzp1 z1.h, z1.h, z1.h
+; SVE-NEXT:    uzp1 z0.h, z0.h, z0.h
+; SVE-NEXT:    splice z0.h, p0, z0.h, z1.h
+; SVE-NEXT:    ldr q1, [x0]
+; SVE-NEXT:    and z1.h, z1.h, #0x7fff
+; SVE-NEXT:    and z0.h, z0.h, #0x8000
+; SVE-NEXT:    orr z0.d, z1.d, z0.d
 ; SVE-NEXT:    str q0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: test_copysign_v8f16_v8f32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q1, q0, [x1]
 ; SVE2-NEXT:    ptrue p0.s
-; SVE2-NEXT:    fcvt z1.h, p0/m, z1.s
-; SVE2-NEXT:    uzp1 z1.h, z1.h, z1.h
+; SVE2-NEXT:    ldp q0, q1, [x1]
 ; SVE2-NEXT:    ldr q2, [x0]
+; SVE2-NEXT:    fcvt z1.h, p0/m, z1.s
 ; SVE2-NEXT:    fcvt z0.h, p0/m, z0.s
-; SVE2-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; SVE2-NEXT:    ptrue p0.h, vl4
-; SVE2-NEXT:    splice z1.h, p0, z1.h, z0.h
-; SVE2-NEXT:    mov z0.h, #32767 // =0x7fff
-; SVE2-NEXT:    bsl z2.d, z2.d, z1.d, z0.d
+; SVE2-NEXT:    uzp1 z1.h, z1.h, z1.h
+; SVE2-NEXT:    uzp1 z0.h, z0.h, z0.h
+; SVE2-NEXT:    splice z0.h, p0, z0.h, z1.h
+; SVE2-NEXT:    mov z1.h, #32767 // =0x7fff
+; SVE2-NEXT:    bsl z2.d, z2.d, z0.d, z1.d
 ; SVE2-NEXT:    str q2, [x0]
 ; SVE2-NEXT:    ret
   %a = load <8 x half>, ptr %ap

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index ccb68a808e4b42..1d6352b4fbe1d4 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-LABEL: fadd_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -23,8 +23,8 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) {
 define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-LABEL: fadd_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -36,8 +36,8 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) {
 define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-LABEL: fadd_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -49,10 +49,11 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -66,8 +67,8 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-LABEL: fadd_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -79,8 +80,8 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) {
 define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-LABEL: fadd_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -92,10 +93,11 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -109,8 +111,8 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-LABEL: fadd_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -122,10 +124,11 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -143,8 +146,8 @@ define void @fadd_v4f64(ptr %a, ptr %b) {
 define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-LABEL: fdiv_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -156,8 +159,8 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) {
 define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-LABEL: fdiv_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -169,8 +172,8 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) {
 define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-LABEL: fdiv_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -182,10 +185,11 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fdiv_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fdiv_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fdivr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fdiv z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -199,8 +203,8 @@ define void @fdiv_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-LABEL: fdiv_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -212,8 +216,8 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) {
 define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-LABEL: fdiv_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -225,10 +229,11 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fdiv_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fdiv_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fdivr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fdiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -242,8 +247,8 @@ define void @fdiv_v8f32(ptr %a, ptr %b) {
 define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-LABEL: fdiv_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -255,10 +260,11 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fdiv_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fdiv_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fdivr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fdiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -276,8 +282,8 @@ define void @fdiv_v4f64(ptr %a, ptr %b) {
 define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) {
 ; CHECK-LABEL: fma_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
@@ -290,8 +296,8 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
 define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) {
 ; CHECK-LABEL: fma_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
@@ -304,8 +310,8 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) {
 ; CHECK-LABEL: fma_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
@@ -318,13 +324,13 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fma_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x2]
-; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z4.h
+; CHECK-NEXT:    ldp q0, q4, [x1]
+; CHECK-NEXT:    ldp q1, q5, [x2]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fmad z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fmla z1.h, p0/m, z2.h, z3.h
+; CHECK-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
@@ -338,8 +344,8 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) {
 ; CHECK-LABEL: fma_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
@@ -352,8 +358,8 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) {
 ; CHECK-LABEL: fma_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
@@ -366,13 +372,13 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fma_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x2]
-; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    ldp q0, q4, [x1]
+; CHECK-NEXT:    ldp q1, q5, [x2]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fmad z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fmla z1.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
@@ -386,8 +392,8 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) {
 ; CHECK-LABEL: fma_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
@@ -400,13 +406,13 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fma_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x2]
-; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z4.d
+; CHECK-NEXT:    ldp q0, q4, [x1]
+; CHECK-NEXT:    ldp q1, q5, [x2]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fmad z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fmla z1.d, p0/m, z2.d, z3.d
+; CHECK-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
@@ -424,8 +430,8 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-LABEL: fmul_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -437,8 +443,8 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) {
 define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-LABEL: fmul_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -450,8 +456,8 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) {
 define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-LABEL: fmul_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -463,10 +469,11 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fmul_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmul_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -480,8 +487,8 @@ define void @fmul_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-LABEL: fmul_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -493,8 +500,8 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) {
 define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-LABEL: fmul_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -506,10 +513,11 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fmul_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmul_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -523,8 +531,8 @@ define void @fmul_v8f32(ptr %a, ptr %b) {
 define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-LABEL: fmul_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -536,10 +544,11 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fmul_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmul_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmul z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -557,8 +566,8 @@ define void @fmul_v4f64(ptr %a, ptr %b) {
 define <2 x half> @fneg_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: fneg_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -569,8 +578,8 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) {
 define <4 x half> @fneg_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: fneg_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -581,8 +590,8 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) {
 define <8 x half> @fneg_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: fneg_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -593,8 +602,8 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) {
 define void @fneg_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fneg_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fneg z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fneg z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -608,8 +617,8 @@ define void @fneg_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fneg_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: fneg_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -620,8 +629,8 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) {
 define <4 x float> @fneg_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: fneg_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -632,8 +641,8 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) {
 define void @fneg_v8f32(ptr %a) {
 ; CHECK-LABEL: fneg_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fneg z0.s, p0/m, z0.s
 ; CHECK-NEXT:    fneg z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -647,8 +656,8 @@ define void @fneg_v8f32(ptr %a) {
 define <2 x double> @fneg_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: fneg_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -659,8 +668,8 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) {
 define void @fneg_v4f64(ptr %a) {
 ; CHECK-LABEL: fneg_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fneg z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fneg z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -678,8 +687,8 @@ define void @fneg_v4f64(ptr %a) {
 define <2 x half> @fsqrt_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: fsqrt_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -690,8 +699,8 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) {
 define <4 x half> @fsqrt_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: fsqrt_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -702,8 +711,8 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) {
 define <8 x half> @fsqrt_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: fsqrt_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -714,8 +723,8 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) {
 define void @fsqrt_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fsqrt_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fsqrt z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fsqrt z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -729,8 +738,8 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fsqrt_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: fsqrt_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -741,8 +750,8 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) {
 define <4 x float> @fsqrt_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: fsqrt_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -753,8 +762,8 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) {
 define void @fsqrt_v8f32(ptr %a) {
 ; CHECK-LABEL: fsqrt_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
 ; CHECK-NEXT:    fsqrt z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -768,8 +777,8 @@ define void @fsqrt_v8f32(ptr %a) {
 define <2 x double> @fsqrt_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: fsqrt_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -780,8 +789,8 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) {
 define void @fsqrt_v4f64(ptr %a) {
 ; CHECK-LABEL: fsqrt_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fsqrt z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -799,8 +808,8 @@ define void @fsqrt_v4f64(ptr %a) {
 define <2 x half> @fsub_v2f16(<2 x half> %op1, <2 x half> %op2) {
 ; CHECK-LABEL: fsub_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -812,8 +821,8 @@ define <2 x half> @fsub_v2f16(<2 x half> %op1, <2 x half> %op2) {
 define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-LABEL: fsub_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -825,8 +834,8 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) {
 define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-LABEL: fsub_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -838,10 +847,11 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fsub_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fsub_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fsub z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fsubr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fsub z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -855,8 +865,8 @@ define void @fsub_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-LABEL: fsub_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -868,8 +878,8 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) {
 define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-LABEL: fsub_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -881,10 +891,11 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fsub_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fsub_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fsubr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -898,8 +909,8 @@ define void @fsub_v8f32(ptr %a, ptr %b) {
 define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-LABEL: fsub_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -911,10 +922,11 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fsub_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fsub_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fsubr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -932,8 +944,8 @@ define void @fsub_v4f64(ptr %a, ptr %b) {
 define <2 x half> @fabs_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: fabs_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -944,8 +956,8 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) {
 define <4 x half> @fabs_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: fabs_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -956,8 +968,8 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) {
 define <8 x half> @fabs_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: fabs_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -968,8 +980,8 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) {
 define void @fabs_v16f16(ptr %a) {
 ; CHECK-LABEL: fabs_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fabs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -983,8 +995,8 @@ define void @fabs_v16f16(ptr %a) {
 define <2 x float> @fabs_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: fabs_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -995,8 +1007,8 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) {
 define <4 x float> @fabs_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: fabs_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1007,8 +1019,8 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) {
 define void @fabs_v8f32(ptr %a) {
 ; CHECK-LABEL: fabs_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    fabs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -1022,8 +1034,8 @@ define void @fabs_v8f32(ptr %a) {
 define <2 x double> @fabs_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: fabs_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1034,8 +1046,8 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) {
 define void @fabs_v4f64(ptr %a) {
 ; CHECK-LABEL: fabs_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fabs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
index cba2c82558e117..2d92820d1bbd71 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
@@ -56,12 +56,12 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_oeq_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmeq p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -106,12 +106,12 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_oeq_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmeq p1.s, p0/z, z1.s, z0.s
-; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmeq p0.s, p0/z, z2.s, z3.s
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -156,12 +156,12 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_oeq_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmeq p1.d, p0/z, z1.d, z0.d
-; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmeq p0.d, p0/z, z2.d, z3.d
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -180,15 +180,15 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ueq_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT:    fcmeq p2.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov p1.b, p2/m, p2.b
-; CHECK-NEXT:    fcmuo p2.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    fcmuo p3.h, p0/z, z2.h, z3.h
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p2.b
+; CHECK-NEXT:    mov p1.b, p2/m, p2.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p3.b
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
@@ -208,15 +208,15 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_one_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    fcmgt p2.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov p1.b, p2/m, p2.b
-; CHECK-NEXT:    fcmgt p2.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    fcmgt p3.h, p0/z, z3.h, z2.h
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT:    sel p0.b, p0, p0.b, p2.b
+; CHECK-NEXT:    mov p1.b, p2/m, p2.b
+; CHECK-NEXT:    sel p0.b, p0, p0.b, p3.b
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
@@ -236,12 +236,12 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_une_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmne p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmne p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -260,12 +260,12 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ogt_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -284,15 +284,15 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ugt_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    fcmge p0.h, p0/z, z3.h, z2.h
 ; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    fcmge p0.h, p0/z, z3.h, z2.h
-; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
@@ -311,12 +311,12 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_olt_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -335,15 +335,15 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ult_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z0.h
+; CHECK-NEXT:    fcmge p0.h, p0/z, z2.h, z3.h
 ; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    fcmge p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
@@ -362,12 +362,12 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_oge_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmge p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -386,15 +386,15 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_uge_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z2.h
 ; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z2.h
-; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
@@ -413,12 +413,12 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ole_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmge p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -437,15 +437,15 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ule_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z0.h
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z2.h, z3.h
 ; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    fcmgt p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
@@ -464,12 +464,12 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_uno_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmuo p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -488,15 +488,15 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ord_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmuo p1.h, p0/z, z1.h, z0.h
+; CHECK-NEXT:    fcmuo p0.h, p0/z, z2.h, z3.h
 ; CHECK-NEXT:    mov z0.h, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    fcmuo p0.h, p0/z, z2.h, z3.h
-; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    eor z1.d, z1.d, z0.d
 ; CHECK-NEXT:    eor z0.d, z2.d, z0.d
 ; CHECK-NEXT:    stp q1, q0, [x2]
 ; CHECK-NEXT:    ret
@@ -515,12 +515,12 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_eq_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmeq p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmeq p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -539,12 +539,12 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ne_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmne p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmne p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -563,12 +563,12 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_gt_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -587,12 +587,12 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_lt_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmgt p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmgt p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -611,12 +611,12 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ge_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmge p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret
@@ -635,12 +635,12 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_le_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ldp q1, q2, [x0]
 ; CHECK-NEXT:    fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    fcmge p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x2]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
index 8e2805ad8fbbd2..fceb8500f9e80d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -7,14 +7,14 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @fp_convert_combine_crash(ptr %a, ptr %b) {
 ; CHECK-LABEL: fp_convert_combine_crash:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    fmov z2.s, #8.00000000
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmov z0.s, #8.00000000
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z0.s
 ; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
-; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
   %f = load <8 x float>, ptr %a
   %mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00,

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 549d09f15122d5..690b85fb2f0846 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v2f16_to_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str d0, [x0]
@@ -24,8 +24,8 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) {
 define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v4f16_to_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x0]
@@ -42,8 +42,8 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) {
 ; CHECK-NEXT:    uunpklo z1.s, z0.h
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
@@ -59,18 +59,17 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
 ; CHECK-NEXT:    fcvt z2.s, p0/m, z2.h
+; CHECK-NEXT:    fcvt z3.s, p0/m, z3.h
+; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
+; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    stp q2, q1, [x0, #32]
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fcvt z1.s, p0/m, z3.h
-; CHECK-NEXT:    stp q1, q0, [x0]
 ; CHECK-NEXT:    ret
   %res = fpext <16 x half> %a to <16 x float>
   store <16 x float> %res, ptr %b
@@ -113,8 +112,8 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) {
 define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v8f16_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
@@ -130,22 +129,20 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) {
 define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v16f16_v16f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #8 // =0x8
-; CHECK-NEXT:    mov x9, #12 // =0xc
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    mov x10, #4 // =0x4
+; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
-; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT:    mov x8, #12 // =0xc
+; CHECK-NEXT:    ld1h { z1.s }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #4 // =0x4
+; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z3.s }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT:    fcvt z1.s, p0/m, z1.h
+; CHECK-NEXT:    fcvt z3.s, p0/m, z3.h
+; CHECK-NEXT:    fcvt z2.s, p0/m, z2.h
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvt z0.s, p0/m, z3.h
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fcvt z1.s, p0/m, z2.h
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    stp q3, q2, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x float>
@@ -187,8 +184,8 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) {
 define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v4f16_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
@@ -204,22 +201,20 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) {
 define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v8f16_v8f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4 // =0x4
-; CHECK-NEXT:    mov x9, #6 // =0x6
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    mov x10, #2 // =0x2
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
-; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT:    ld1h { z2.d }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT:    mov x8, #6 // =0x6
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #2 // =0x2
+; CHECK-NEXT:    ld1h { z2.d }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
+; CHECK-NEXT:    fcvt z3.d, p0/m, z3.h
+; CHECK-NEXT:    fcvt z2.d, p0/m, z2.h
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvt z0.d, p0/m, z3.h
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fcvt z1.d, p0/m, z2.h
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    stp q3, q2, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x half>, ptr %a
   %res = fpext <8 x half> %op1 to <8 x double>
@@ -230,40 +225,37 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) {
 define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v16f16_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #14 // =0xe
-; CHECK-NEXT:    mov x10, #12 // =0xc
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    mov x8, #2 // =0x2
-; CHECK-NEXT:    mov x11, #6 // =0x6
-; CHECK-NEXT:    mov x12, #4 // =0x4
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, x10, lsl #1]
-; CHECK-NEXT:    mov x9, #8 // =0x8
-; CHECK-NEXT:    mov x10, #10 // =0xa
+; CHECK-NEXT:    mov x8, #12 // =0xc
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #14 // =0xe
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ld1h { z2.d }, p0/z, [x0, x8, lsl #1]
-; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x0, x11, lsl #1]
-; CHECK-NEXT:    ld1h { z5.d }, p0/z, [x0, x12, lsl #1]
+; CHECK-NEXT:    mov x8, #10 // =0xa
+; CHECK-NEXT:    ld1h { z3.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
+; CHECK-NEXT:    ld1h { z4.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #6 // =0x6
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z1.h
-; CHECK-NEXT:    ld1h { z4.d }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT:    ld1h { z6.d }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT:    ld1h { z5.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT:    mov x8, #2 // =0x2
+; CHECK-NEXT:    fcvt z2.d, p0/m, z2.h
+; CHECK-NEXT:    ld1h { z6.d }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    ld1h { z7.d }, p0/z, [x0]
-; CHECK-NEXT:    stp q1, q0, [x1, #96]
-; CHECK-NEXT:    movprfx z1, z4
-; CHECK-NEXT:    fcvt z1.d, p0/m, z4.h
-; CHECK-NEXT:    movprfx z0, z6
-; CHECK-NEXT:    fcvt z0.d, p0/m, z6.h
-; CHECK-NEXT:    stp q1, q0, [x1, #64]
-; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fcvt z1.d, p0/m, z5.h
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvt z0.d, p0/m, z3.h
-; CHECK-NEXT:    stp q1, q0, [x1, #32]
+; CHECK-NEXT:    fcvt z3.d, p0/m, z3.h
+; CHECK-NEXT:    fcvt z4.d, p0/m, z4.h
+; CHECK-NEXT:    stp q0, q1, [x1, #96]
+; CHECK-NEXT:    movprfx z0, z5
+; CHECK-NEXT:    fcvt z0.d, p0/m, z5.h
 ; CHECK-NEXT:    movprfx z1, z7
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z7.h
-; CHECK-NEXT:    movprfx z0, z2
-; CHECK-NEXT:    fcvt z0.d, p0/m, z2.h
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    stp q2, q3, [x1, #64]
+; CHECK-NEXT:    movprfx z2, z6
+; CHECK-NEXT:    fcvt z2.d, p0/m, z6.h
+; CHECK-NEXT:    stp q1, q2, [x1]
+; CHECK-NEXT:    stp q4, q0, [x1, #32]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fpext <16 x half> %op1 to <16 x double>
@@ -305,8 +297,8 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) {
 define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v4f32_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z1.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
@@ -322,22 +314,20 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) {
 define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v8f32_v8f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4 // =0x4
-; CHECK-NEXT:    mov x9, #6 // =0x6
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    mov x10, #2 // =0x2
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
-; CHECK-NEXT:    ld1w { z1.d }, p0/z, [x0, x9, lsl #2]
-; CHECK-NEXT:    ld1w { z2.d }, p0/z, [x0, x10, lsl #2]
+; CHECK-NEXT:    mov x8, #6 // =0x6
+; CHECK-NEXT:    ld1w { z1.d }, p0/z, [x0, x8, lsl #2]
+; CHECK-NEXT:    mov x8, #2 // =0x2
+; CHECK-NEXT:    ld1w { z2.d }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z3.d }, p0/z, [x0]
 ; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
 ; CHECK-NEXT:    fcvt z1.d, p0/m, z1.s
+; CHECK-NEXT:    fcvt z3.d, p0/m, z3.s
+; CHECK-NEXT:    fcvt z2.d, p0/m, z2.s
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z0, z3
-; CHECK-NEXT:    fcvt z0.d, p0/m, z3.s
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fcvt z1.d, p0/m, z2.s
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    stp q3, q2, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fpext <8 x float> %op1 to <8 x double>
@@ -352,8 +342,8 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) {
 define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v2f32_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -366,8 +356,8 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) {
 define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v4f32_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -380,13 +370,13 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) {
 define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v8f32_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
-; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    fcvt z1.h, p0/m, z1.s
-; CHECK-NEXT:    st1h { z1.s }, p0, [x1, x8, lsl #1]
+; CHECK-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
+; CHECK-NEXT:    st1h { z1.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptrunc <8 x float> %op1 to <8 x half>
@@ -401,8 +391,8 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) {
 define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v1f64_v1f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -415,8 +405,8 @@ define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) {
 define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v2f64_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
@@ -429,13 +419,13 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) {
 define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v4f64_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
-; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
 ; CHECK-NEXT:    fcvt z1.h, p0/m, z1.d
-; CHECK-NEXT:    st1h { z1.d }, p0, [x1, x8, lsl #1]
+; CHECK-NEXT:    st1h { z0.d }, p0, [x1, x8, lsl #1]
+; CHECK-NEXT:    st1h { z1.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x half>
@@ -450,8 +440,8 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) {
 define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) {
 ; CHECK-LABEL: fcvt_v1f64_v1f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -463,8 +453,8 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) {
 define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) {
 ; CHECK-LABEL: fcvt_v2f64_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -476,13 +466,13 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) {
 define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v4f64_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
-; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
 ; CHECK-NEXT:    fcvt z1.s, p0/m, z1.d
-; CHECK-NEXT:    st1w { z1.d }, p0, [x1, x8, lsl #2]
+; CHECK-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
+; CHECK-NEXT:    st1w { z1.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %res = fptrunc <4 x double> %op1 to <4 x float>

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index db8000f9fe6d5e..b5df97f767c13b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) {
 ; CHECK-LABEL: fma_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
@@ -25,8 +25,8 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) {
 ; CHECK-LABEL: fma_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
@@ -40,13 +40,13 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fma_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x2]
-; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z4.h
+; CHECK-NEXT:    ldp q0, q4, [x1]
+; CHECK-NEXT:    ldp q1, q5, [x2]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fmad z0.h, p0/m, z2.h, z1.h
 ; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fmla z1.h, p0/m, z2.h, z3.h
+; CHECK-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
@@ -61,8 +61,8 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) {
 ; CHECK-LABEL: fma_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
@@ -76,8 +76,8 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o
 define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) {
 ; CHECK-LABEL: fma_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
@@ -91,13 +91,13 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fma_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x2]
-; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    ldp q0, q4, [x1]
+; CHECK-NEXT:    ldp q1, q5, [x2]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fmad z0.s, p0/m, z2.s, z1.s
 ; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fmla z1.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
@@ -125,8 +125,8 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double
 define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) {
 ; CHECK-LABEL: fma_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q2 killed $q2 def $z2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
@@ -140,13 +140,13 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fma_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q3, [x1]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    ldp q4, q5, [x2]
-; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z4.d
+; CHECK-NEXT:    ldp q0, q4, [x1]
+; CHECK-NEXT:    ldp q1, q5, [x2]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fmad z0.d, p0/m, z2.d, z1.d
 ; CHECK-NEXT:    movprfx z1, z5
-; CHECK-NEXT:    fmla z1.d, p0/m, z2.d, z3.d
+; CHECK-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index 47292ee0392d23..cfd46aa68730fe 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-LABEL: fmaxnm_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -23,8 +23,8 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-LABEL: fmaxnm_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -36,10 +36,11 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fmaxnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmaxnm_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmaxnm z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -53,8 +54,8 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-LABEL: fmaxnm_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -66,8 +67,8 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-LABEL: fmaxnm_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -79,10 +80,11 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fmaxnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmaxnm_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmaxnm z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -107,8 +109,8 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-LABEL: fmaxnm_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -120,10 +122,11 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fmaxnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmaxnm_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmaxnm z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -141,8 +144,8 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) {
 define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-LABEL: fminnm_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -154,8 +157,8 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) {
 define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-LABEL: fminnm_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -167,10 +170,11 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fminnm_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fminnm_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fminnm z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -184,8 +188,8 @@ define void @fminnm_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-LABEL: fminnm_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -197,8 +201,8 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) {
 define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-LABEL: fminnm_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -210,10 +214,11 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fminnm_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fminnm_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fminnm z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -238,8 +243,8 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) {
 define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-LABEL: fminnm_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -251,10 +256,11 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fminnm_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fminnm_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fminnm z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -272,8 +278,8 @@ define void @fminnm_v4f64(ptr %a, ptr %b) {
 define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-LABEL: fmax_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -285,8 +291,8 @@ define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) {
 define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-LABEL: fmax_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -298,10 +304,11 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fmax_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmax_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -315,8 +322,8 @@ define void @fmax_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-LABEL: fmax_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -328,8 +335,8 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) {
 define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-LABEL: fmax_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -341,10 +348,11 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fmax_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmax_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -369,8 +377,8 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) {
 define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-LABEL: fmax_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -382,10 +390,11 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fmax_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmax_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -403,8 +412,8 @@ define void @fmax_v4f64(ptr %a, ptr %b) {
 define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-LABEL: fmin_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -416,8 +425,8 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) {
 define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-LABEL: fmin_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -429,10 +438,11 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fmin_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmin_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -446,8 +456,8 @@ define void @fmin_v16f16(ptr %a, ptr %b) {
 define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) {
 ; CHECK-LABEL: fmin_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -459,8 +469,8 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) {
 define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) {
 ; CHECK-LABEL: fmin_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -472,10 +482,11 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fmin_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmin_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -500,8 +511,8 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) {
 define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) {
 ; CHECK-LABEL: fmin_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -513,10 +524,11 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fmin_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fmin_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fmin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index 8675477a7d60e5..a94870815d42f3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define half @fadda_v4f16(half %start, <4 x half> %a) {
 ; CHECK-LABEL: fadda_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fadda h0, p0, h0, z1.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -23,8 +23,8 @@ define half @fadda_v4f16(half %start, <4 x half> %a) {
 define half @fadda_v8f16(half %start, <8 x half> %a) {
 ; CHECK-LABEL: fadda_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fadda h0, p0, h0, z1.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -36,11 +36,12 @@ define half @fadda_v8f16(half %start, <8 x half> %a) {
 define half @fadda_v16f16(half %start, ptr %a) {
 ; CHECK-LABEL: fadda_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    fadda h0, p0, h0, z1.h
+; CHECK-NEXT:    ldr q1, [x0, #16]
 ; CHECK-NEXT:    fadda h0, p0, h0, z1.h
-; CHECK-NEXT:    fadda h0, p0, h0, z2.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
   %op = load <16 x half>, ptr %a
@@ -51,8 +52,8 @@ define half @fadda_v16f16(half %start, ptr %a) {
 define float @fadda_v2f32(float %start, <2 x float> %a) {
 ; CHECK-LABEL: fadda_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    fadda s0, p0, s0, z1.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
@@ -64,8 +65,8 @@ define float @fadda_v2f32(float %start, <2 x float> %a) {
 define float @fadda_v4f32(float %start, <4 x float> %a) {
 ; CHECK-LABEL: fadda_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fadda s0, p0, s0, z1.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
@@ -77,11 +78,12 @@ define float @fadda_v4f32(float %start, <4 x float> %a) {
 define float @fadda_v8f32(float %start, ptr %a) {
 ; CHECK-LABEL: fadda_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    fadda s0, p0, s0, z1.s
+; CHECK-NEXT:    ldr q1, [x0, #16]
 ; CHECK-NEXT:    fadda s0, p0, s0, z1.s
-; CHECK-NEXT:    fadda s0, p0, s0, z2.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
   %op = load <8 x float>, ptr %a
@@ -102,8 +104,8 @@ define double @fadda_v1f64(double %start, <1 x double> %a) {
 define double @fadda_v2f64(double %start, <2 x double> %a) {
 ; CHECK-LABEL: fadda_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    fadda d0, p0, d0, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -115,11 +117,12 @@ define double @fadda_v2f64(double %start, <2 x double> %a) {
 define double @fadda_v4f64(double %start, ptr %a) {
 ; CHECK-LABEL: fadda_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    fadda d0, p0, d0, z1.d
+; CHECK-NEXT:    ldr q1, [x0, #16]
 ; CHECK-NEXT:    fadda d0, p0, d0, z1.d
-; CHECK-NEXT:    fadda d0, p0, d0, z2.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %op = load <4 x double>, ptr %a
@@ -134,8 +137,8 @@ define double @fadda_v4f64(double %start, ptr %a) {
 define half @faddv_v4f16(half %start, <4 x half> %a) {
 ; CHECK-LABEL: faddv_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
@@ -146,8 +149,8 @@ define half @faddv_v4f16(half %start, <4 x half> %a) {
 define half @faddv_v8f16(half %start, <8 x half> %a) {
 ; CHECK-LABEL: faddv_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
 ; CHECK-NEXT:    ret
@@ -158,8 +161,8 @@ define half @faddv_v8f16(half %start, <8 x half> %a) {
 define half @faddv_v16f16(half %start, ptr %a) {
 ; CHECK-LABEL: faddv_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q2, q1, [x0]
 ; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT:    faddv h1, p0, z1.h
 ; CHECK-NEXT:    fadd h0, h0, h1
@@ -172,8 +175,8 @@ define half @faddv_v16f16(half %start, ptr %a) {
 define float @faddv_v2f32(float %start, <2 x float> %a) {
 ; CHECK-LABEL: faddv_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
@@ -184,8 +187,8 @@ define float @faddv_v2f32(float %start, <2 x float> %a) {
 define float @faddv_v4f32(float %start, <4 x float> %a) {
 ; CHECK-LABEL: faddv_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
@@ -196,8 +199,8 @@ define float @faddv_v4f32(float %start, <4 x float> %a) {
 define float @faddv_v8f32(float %start, ptr %a) {
 ; CHECK-LABEL: faddv_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q2, q1, [x0]
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    faddv s1, p0, z1.s
 ; CHECK-NEXT:    fadd s0, s0, s1
@@ -220,8 +223,8 @@ define double @faddv_v1f64(double %start, <1 x double> %a) {
 define double @faddv_v2f64(double %start, <2 x double> %a) {
 ; CHECK-LABEL: faddv_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    fadd d0, d0, d1
 ; CHECK-NEXT:    ret
@@ -232,8 +235,8 @@ define double @faddv_v2f64(double %start, <2 x double> %a) {
 define double @faddv_v4f64(double %start, ptr %a) {
 ; CHECK-LABEL: faddv_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q2, q1, [x0]
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    faddv d1, p0, z1.d
 ; CHECK-NEXT:    fadd d0, d0, d1
@@ -250,8 +253,8 @@ define double @faddv_v4f64(double %start, ptr %a) {
 define half @fmaxv_v4f16(<4 x half> %a) {
 ; CHECK-LABEL: fmaxv_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -262,8 +265,8 @@ define half @fmaxv_v4f16(<4 x half> %a) {
 define half @fmaxv_v8f16(<8 x half> %a) {
 ; CHECK-LABEL: fmaxv_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -274,8 +277,8 @@ define half @fmaxv_v8f16(<8 x half> %a) {
 define half @fmaxv_v16f16(ptr %a) {
 ; CHECK-LABEL: fmaxv_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -288,8 +291,8 @@ define half @fmaxv_v16f16(ptr %a) {
 define float @fmaxv_v2f32(<2 x float> %a) {
 ; CHECK-LABEL: fmaxv_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -300,8 +303,8 @@ define float @fmaxv_v2f32(<2 x float> %a) {
 define float @fmaxv_v4f32(<4 x float> %a) {
 ; CHECK-LABEL: fmaxv_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -312,8 +315,8 @@ define float @fmaxv_v4f32(<4 x float> %a) {
 define float @fmaxv_v8f32(ptr %a) {
 ; CHECK-LABEL: fmaxv_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
@@ -336,8 +339,8 @@ define double @fmaxv_v1f64(<1 x double> %a) {
 define double @fmaxv_v2f64(<2 x double> %a) {
 ; CHECK-LABEL: fmaxv_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -348,8 +351,8 @@ define double @fmaxv_v2f64(<2 x double> %a) {
 define double @fmaxv_v4f64(ptr %a) {
 ; CHECK-LABEL: fmaxv_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -366,8 +369,8 @@ define double @fmaxv_v4f64(ptr %a) {
 define half @fminv_v4f16(<4 x half> %a) {
 ; CHECK-LABEL: fminv_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -378,8 +381,8 @@ define half @fminv_v4f16(<4 x half> %a) {
 define half @fminv_v8f16(<8 x half> %a) {
 ; CHECK-LABEL: fminv_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -390,8 +393,8 @@ define half @fminv_v8f16(<8 x half> %a) {
 define half @fminv_v16f16(ptr %a) {
 ; CHECK-LABEL: fminv_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    fminnmv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -404,8 +407,8 @@ define half @fminv_v16f16(ptr %a) {
 define float @fminv_v2f32(<2 x float> %a) {
 ; CHECK-LABEL: fminv_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -416,8 +419,8 @@ define float @fminv_v2f32(<2 x float> %a) {
 define float @fminv_v4f32(<4 x float> %a) {
 ; CHECK-LABEL: fminv_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -428,8 +431,8 @@ define float @fminv_v4f32(<4 x float> %a) {
 define float @fminv_v8f32(ptr %a) {
 ; CHECK-LABEL: fminv_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    fminnmv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
@@ -452,8 +455,8 @@ define double @fminv_v1f64(<1 x double> %a) {
 define double @fminv_v2f64(<2 x double> %a) {
 ; CHECK-LABEL: fminv_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fminnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -464,8 +467,8 @@ define double @fminv_v2f64(<2 x double> %a) {
 define double @fminv_v4f64(ptr %a) {
 ; CHECK-LABEL: fminv_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    fminnmv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -482,8 +485,8 @@ define double @fminv_v4f64(ptr %a) {
 define half @fmaximumv_v4f16(<4 x half> %a) {
 ; CHECK-LABEL: fmaximumv_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -494,8 +497,8 @@ define half @fmaximumv_v4f16(<4 x half> %a) {
 define half @fmaximumv_v8f16(<8 x half> %a) {
 ; CHECK-LABEL: fmaximumv_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -506,8 +509,8 @@ define half @fmaximumv_v8f16(<8 x half> %a) {
 define half @fmaximumv_v16f16(ptr %a) {
 ; CHECK-LABEL: fmaximumv_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    fmaxv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -520,8 +523,8 @@ define half @fmaximumv_v16f16(ptr %a) {
 define float @fmaximumv_v2f32(<2 x float> %a) {
 ; CHECK-LABEL: fmaximumv_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -532,8 +535,8 @@ define float @fmaximumv_v2f32(<2 x float> %a) {
 define float @fmaximumv_v4f32(<4 x float> %a) {
 ; CHECK-LABEL: fmaximumv_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -544,8 +547,8 @@ define float @fmaximumv_v4f32(<4 x float> %a) {
 define float @fmaximumv_v8f32(ptr %a) {
 ; CHECK-LABEL: fmaximumv_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    fmaxv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
@@ -568,8 +571,8 @@ define double @fmaximumv_v1f64(<1 x double> %a) {
 define double @fmaximumv_v2f64(<2 x double> %a) {
 ; CHECK-LABEL: fmaximumv_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fmaxv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -580,8 +583,8 @@ define double @fmaximumv_v2f64(<2 x double> %a) {
 define double @fmaximumv_v4f64(ptr %a) {
 ; CHECK-LABEL: fmaximumv_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    fmaxv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -598,8 +601,8 @@ define double @fmaximumv_v4f64(ptr %a) {
 define half @fminimumv_v4f16(<4 x half> %a) {
 ; CHECK-LABEL: fminimumv_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -610,8 +613,8 @@ define half @fminimumv_v4f16(<4 x half> %a) {
 define half @fminimumv_v8f16(<8 x half> %a) {
 ; CHECK-LABEL: fminimumv_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
 ; CHECK-NEXT:    ret
@@ -622,8 +625,8 @@ define half @fminimumv_v8f16(<8 x half> %a) {
 define half @fminimumv_v16f16(ptr %a) {
 ; CHECK-LABEL: fminimumv_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    fminv h0, p0, z0.h
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
@@ -636,8 +639,8 @@ define half @fminimumv_v16f16(ptr %a) {
 define float @fminimumv_v2f32(<2 x float> %a) {
 ; CHECK-LABEL: fminimumv_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -648,8 +651,8 @@ define float @fminimumv_v2f32(<2 x float> %a) {
 define float @fminimumv_v4f32(<4 x float> %a) {
 ; CHECK-LABEL: fminimumv_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
 ; CHECK-NEXT:    ret
@@ -660,8 +663,8 @@ define float @fminimumv_v4f32(<4 x float> %a) {
 define float @fminimumv_v8f32(ptr %a) {
 ; CHECK-LABEL: fminimumv_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    fminv s0, p0, z0.s
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
@@ -684,8 +687,8 @@ define double @fminimumv_v1f64(<1 x double> %a) {
 define double @fminimumv_v2f64(<2 x double> %a) {
 ; CHECK-LABEL: fminimumv_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fminv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -696,8 +699,8 @@ define double @fminimumv_v2f64(<2 x double> %a) {
 define double @fminimumv_v4f64(ptr %a) {
 ; CHECK-LABEL: fminimumv_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    fminv d0, p0, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index 74d20d188a71d5..fedcdbc979622d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <2 x half> @frintp_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: frintp_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -22,8 +22,8 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) {
 define <4 x half> @frintp_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: frintp_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -34,8 +34,8 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) {
 define <8 x half> @frintp_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: frintp_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -46,8 +46,8 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) {
 define void @frintp_v16f16(ptr %a) {
 ; CHECK-LABEL: frintp_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintp z0.h, p0/m, z0.h
 ; CHECK-NEXT:    frintp z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -61,8 +61,8 @@ define void @frintp_v16f16(ptr %a) {
 define <2 x float> @frintp_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: frintp_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -73,8 +73,8 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) {
 define <4 x float> @frintp_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: frintp_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -85,8 +85,8 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) {
 define void @frintp_v8f32(ptr %a) {
 ; CHECK-LABEL: frintp_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintp z0.s, p0/m, z0.s
 ; CHECK-NEXT:    frintp z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -110,8 +110,8 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) {
 define <2 x double> @frintp_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: frintp_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintp z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -122,8 +122,8 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) {
 define void @frintp_v4f64(ptr %a) {
 ; CHECK-LABEL: frintp_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintp z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frintp z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -141,8 +141,8 @@ define void @frintp_v4f64(ptr %a) {
 define <2 x half> @frintm_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: frintm_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -153,8 +153,8 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) {
 define <4 x half> @frintm_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: frintm_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -165,8 +165,8 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) {
 define <8 x half> @frintm_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: frintm_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -177,8 +177,8 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) {
 define void @frintm_v16f16(ptr %a) {
 ; CHECK-LABEL: frintm_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintm z0.h, p0/m, z0.h
 ; CHECK-NEXT:    frintm z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -192,8 +192,8 @@ define void @frintm_v16f16(ptr %a) {
 define <2 x float> @frintm_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: frintm_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -204,8 +204,8 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) {
 define <4 x float> @frintm_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: frintm_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -216,8 +216,8 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) {
 define void @frintm_v8f32(ptr %a) {
 ; CHECK-LABEL: frintm_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
 ; CHECK-NEXT:    frintm z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -241,8 +241,8 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) {
 define <2 x double> @frintm_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: frintm_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -253,8 +253,8 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) {
 define void @frintm_v4f64(ptr %a) {
 ; CHECK-LABEL: frintm_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frintm z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -272,8 +272,8 @@ define void @frintm_v4f64(ptr %a) {
 define <2 x half> @frinti_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: frinti_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -284,8 +284,8 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) {
 define <4 x half> @frinti_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: frinti_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -296,8 +296,8 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) {
 define <8 x half> @frinti_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: frinti_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -308,8 +308,8 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) {
 define void @frinti_v16f16(ptr %a) {
 ; CHECK-LABEL: frinti_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frinti z0.h, p0/m, z0.h
 ; CHECK-NEXT:    frinti z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -323,8 +323,8 @@ define void @frinti_v16f16(ptr %a) {
 define <2 x float> @frinti_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: frinti_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -335,8 +335,8 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) {
 define <4 x float> @frinti_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: frinti_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -347,8 +347,8 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) {
 define void @frinti_v8f32(ptr %a) {
 ; CHECK-LABEL: frinti_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frinti z0.s, p0/m, z0.s
 ; CHECK-NEXT:    frinti z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -372,8 +372,8 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) {
 define <2 x double> @frinti_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: frinti_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frinti z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -384,8 +384,8 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) {
 define void @frinti_v4f64(ptr %a) {
 ; CHECK-LABEL: frinti_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frinti z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frinti z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -403,8 +403,8 @@ define void @frinti_v4f64(ptr %a) {
 define <2 x half> @frintx_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: frintx_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -415,8 +415,8 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) {
 define <4 x half> @frintx_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: frintx_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -427,8 +427,8 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) {
 define <8 x half> @frintx_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: frintx_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -439,8 +439,8 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) {
 define void @frintx_v16f16(ptr %a) {
 ; CHECK-LABEL: frintx_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintx z0.h, p0/m, z0.h
 ; CHECK-NEXT:    frintx z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -454,8 +454,8 @@ define void @frintx_v16f16(ptr %a) {
 define <2 x float> @frintx_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: frintx_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -466,8 +466,8 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) {
 define <4 x float> @frintx_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: frintx_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -478,8 +478,8 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) {
 define void @frintx_v8f32(ptr %a) {
 ; CHECK-LABEL: frintx_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintx z0.s, p0/m, z0.s
 ; CHECK-NEXT:    frintx z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -503,8 +503,8 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) {
 define <2 x double> @frintx_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: frintx_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -515,8 +515,8 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) {
 define void @frintx_v4f64(ptr %a) {
 ; CHECK-LABEL: frintx_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintx z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frintx z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -534,8 +534,8 @@ define void @frintx_v4f64(ptr %a) {
 define <2 x half> @frinta_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: frinta_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -546,8 +546,8 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) {
 define <4 x half> @frinta_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: frinta_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -558,8 +558,8 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) {
 define <8 x half> @frinta_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: frinta_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -570,8 +570,8 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) {
 define void @frinta_v16f16(ptr %a) {
 ; CHECK-LABEL: frinta_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frinta z0.h, p0/m, z0.h
 ; CHECK-NEXT:    frinta z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -585,8 +585,8 @@ define void @frinta_v16f16(ptr %a) {
 define <2 x float> @frinta_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: frinta_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -597,8 +597,8 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) {
 define <4 x float> @frinta_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: frinta_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -609,8 +609,8 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) {
 define void @frinta_v8f32(ptr %a) {
 ; CHECK-LABEL: frinta_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frinta z0.s, p0/m, z0.s
 ; CHECK-NEXT:    frinta z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -634,8 +634,8 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) {
 define <2 x double> @frinta_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: frinta_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frinta z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -646,8 +646,8 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) {
 define void @frinta_v4f64(ptr %a) {
 ; CHECK-LABEL: frinta_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frinta z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frinta z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -665,8 +665,8 @@ define void @frinta_v4f64(ptr %a) {
 define <2 x half> @frintn_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: frintn_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -677,8 +677,8 @@ define <2 x half> @frintn_v2f16(<2 x half> %op) {
 define <4 x half> @frintn_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: frintn_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -689,8 +689,8 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) {
 define <8 x half> @frintn_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: frintn_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -701,8 +701,8 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) {
 define void @frintn_v16f16(ptr %a) {
 ; CHECK-LABEL: frintn_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
 ; CHECK-NEXT:    frintn z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -716,8 +716,8 @@ define void @frintn_v16f16(ptr %a) {
 define <2 x float> @frintn_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: frintn_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -728,8 +728,8 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) {
 define <4 x float> @frintn_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: frintn_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -740,8 +740,8 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) {
 define void @frintn_v8f32(ptr %a) {
 ; CHECK-LABEL: frintn_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
 ; CHECK-NEXT:    frintn z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -765,8 +765,8 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) {
 define <2 x double> @frintn_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: frintn_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -777,8 +777,8 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) {
 define void @frintn_v4f64(ptr %a) {
 ; CHECK-LABEL: frintn_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frintn z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -796,8 +796,8 @@ define void @frintn_v4f64(ptr %a) {
 define <2 x half> @frintz_v2f16(<2 x half> %op) {
 ; CHECK-LABEL: frintz_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -808,8 +808,8 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) {
 define <4 x half> @frintz_v4f16(<4 x half> %op) {
 ; CHECK-LABEL: frintz_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -820,8 +820,8 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) {
 define <8 x half> @frintz_v8f16(<8 x half> %op) {
 ; CHECK-LABEL: frintz_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -832,8 +832,8 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) {
 define void @frintz_v16f16(ptr %a) {
 ; CHECK-LABEL: frintz_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintz z0.h, p0/m, z0.h
 ; CHECK-NEXT:    frintz z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -847,8 +847,8 @@ define void @frintz_v16f16(ptr %a) {
 define <2 x float> @frintz_v2f32(<2 x float> %op) {
 ; CHECK-LABEL: frintz_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -859,8 +859,8 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) {
 define <4 x float> @frintz_v4f32(<4 x float> %op) {
 ; CHECK-LABEL: frintz_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -871,8 +871,8 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) {
 define void @frintz_v8f32(ptr %a) {
 ; CHECK-LABEL: frintz_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintz z0.s, p0/m, z0.s
 ; CHECK-NEXT:    frintz z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -896,8 +896,8 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) {
 define <2 x double> @frintz_v2f64(<2 x double> %op) {
 ; CHECK-LABEL: frintz_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    frintz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -908,8 +908,8 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) {
 define void @frintz_v4f64(ptr %a) {
 ; CHECK-LABEL: frintz_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    frintz z0.d, p0/m, z0.d
 ; CHECK-NEXT:    frintz z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
index 2344b4741088d1..0f79310a69f664 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.h, w8
@@ -22,8 +22,8 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) {
 define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.h, w8
@@ -38,8 +38,8 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) {
 define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mov z2.h, w8
@@ -54,14 +54,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) {
 define void @select_v16f16(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-LABEL: select_v16f16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x0, #16]
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q3, [x1, #16]
-; CHECK-NEXT:    mov z4.h, w8
-; CHECK-NEXT:    cmpne p0.h, p0/z, z4.h, #0
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -76,8 +76,8 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) {
 define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.s, w8
@@ -92,8 +92,8 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) {
 define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mov z2.s, w8
@@ -108,14 +108,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) {
 define void @select_v8f32(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-LABEL: select_v8f32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x0, #16]
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q3, [x1, #16]
-; CHECK-NEXT:    mov z4.s, w8
-; CHECK-NEXT:    cmpne p0.s, p0/z, z4.s, #0
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
 ; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -149,9 +149,9 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
 define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v2f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0x1
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mov z2.d, x8
@@ -166,15 +166,15 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
 define void @select_v4f64(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-LABEL: select_v4f64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    and x8, x2, #0x1
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x0, #16]
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q3, [x1, #16]
-; CHECK-NEXT:    mov z4.d, x8
-; CHECK-NEXT:    cmpne p0.d, p0/z, z4.d, #0
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index f2691e0635ac31..4b4e3522f8c6c1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) {
 ; CHECK-LABEL: fcvtzu_v4f16_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -22,8 +22,8 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) {
 define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v8f16_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
@@ -36,8 +36,8 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) {
 define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzu z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzu z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -55,8 +55,8 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) {
 ; CHECK-LABEL: fcvtzu_v2f16_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -68,8 +68,8 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) {
 define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) {
 ; CHECK-LABEL: fcvtzu_v4f16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -99,21 +99,20 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) {
 define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f16_v16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    fcvtzu z2.s, p0/m, z2.h
 ; CHECK-NEXT:    fcvtzu z3.s, p0/m, z3.h
-; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.h
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.h
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fcvtzu z1.s, p0/m, z2.h
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.h
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptoui <16 x half> %op1 to <16 x i32>
@@ -139,9 +138,9 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-LABEL: fcvtzu_v2f16_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    fcvtzu x9, h0
+; CHECK-NEXT:    fcvtzu x9, h1
 ; CHECK-NEXT:    stp x8, x9, [sp, #-16]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [sp], #16
@@ -156,10 +155,10 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z2.h, z0.h[3]
 ; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    fcvtzu x10, h1
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    fcvtzu x10, h2
 ; CHECK-NEXT:    fcvtzu x11, h0
 ; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
@@ -181,27 +180,27 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z3.h, z0.h[2]
 ; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    fcvtzu x10, h1
-; CHECK-NEXT:    mov z1.h, z0.h[2]
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    fcvtzu x11, h1
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    fcvtzu x10, h2
+; CHECK-NEXT:    fcvtzu x11, h3
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    mov z2.h, z0.h[3]
 ; CHECK-NEXT:    fcvtzu x12, h0
+; CHECK-NEXT:    mov z0.h, z0.h[2]
 ; CHECK-NEXT:    stp x8, x9, [sp, #32]
 ; CHECK-NEXT:    fcvtzu x8, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    fcvtzu x9, h2
 ; CHECK-NEXT:    stp x11, x10, [sp, #48]
-; CHECK-NEXT:    fcvtzu x9, h1
 ; CHECK-NEXT:    fcvtzu x10, h0
+; CHECK-NEXT:    ldp q2, q3, [sp, #32]
 ; CHECK-NEXT:    stp x12, x8, [sp]
-; CHECK-NEXT:    ldp q3, q2, [sp, #32]
 ; CHECK-NEXT:    stp x10, x9, [sp, #16]
 ; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    stp q3, q2, [x1]
+; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    stp q1, q0, [x1, #32]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
@@ -216,53 +215,54 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #128
 ; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    mov z3.h, z1.h[3]
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    mov z3.h, z0.h[2]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    fcvtzu x9, h1
+; CHECK-NEXT:    fcvtzu x10, h2
+; CHECK-NEXT:    ldr q1, [x0, #16]
+; CHECK-NEXT:    fcvtzu x11, h3
+; CHECK-NEXT:    mov z2.h, z0.h[1]
+; CHECK-NEXT:    mov z3.h, z0.h[3]
+; CHECK-NEXT:    fcvtzu x12, h1
+; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    mov z0.h, z0.h[2]
 ; CHECK-NEXT:    fcvtzu x9, h2
-; CHECK-NEXT:    mov z2.h, z1.h[2]
-; CHECK-NEXT:    fcvtzu x8, h1
+; CHECK-NEXT:    stp x11, x10, [sp, #48]
 ; CHECK-NEXT:    fcvtzu x10, h3
-; CHECK-NEXT:    fcvtzu x11, h2
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    fcvtzu x12, h1
 ; CHECK-NEXT:    mov z2.h, z1.h[1]
 ; CHECK-NEXT:    mov z3.h, z1.h[3]
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #32]
-; CHECK-NEXT:    fcvtzu x9, h3
-; CHECK-NEXT:    stp x11, x10, [sp, #48]
-; CHECK-NEXT:    fcvtzu x10, h1
+; CHECK-NEXT:    fcvtzu x11, h0
+; CHECK-NEXT:    mov z0.h, z1.h[2]
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    stp x8, x9, [sp]
 ; CHECK-NEXT:    fcvtzu x8, h2
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    stp x10, x9, [sp, #16]
-; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    stp x12, x8, [sp]
-; CHECK-NEXT:    fcvtzu x8, h0
-; CHECK-NEXT:    fcvtzu x10, h1
-; CHECK-NEXT:    mov z1.h, z0.h[2]
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    fcvtzu x9, h3
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    fcvtzu x10, h0
+; CHECK-NEXT:    mov z0.h, z1.h[3]
 ; CHECK-NEXT:    fcvtzu x11, h1
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    stp x8, x9, [sp, #96]
+; CHECK-NEXT:    mov z1.h, z1.h[2]
+; CHECK-NEXT:    stp x12, x8, [sp, #96]
+; CHECK-NEXT:    fcvtzu x12, h2
 ; CHECK-NEXT:    fcvtzu x8, h0
+; CHECK-NEXT:    ldp q3, q4, [sp]
+; CHECK-NEXT:    stp x10, x9, [sp, #112]
 ; CHECK-NEXT:    fcvtzu x9, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x11, x10, [sp, #112]
-; CHECK-NEXT:    fcvtzu x10, h1
-; CHECK-NEXT:    fcvtzu x11, h0
-; CHECK-NEXT:    stp x8, x9, [sp, #64]
 ; CHECK-NEXT:    ldp q0, q1, [sp, #32]
-; CHECK-NEXT:    stp x11, x10, [sp, #80]
-; CHECK-NEXT:    ldp q2, q3, [sp]
-; CHECK-NEXT:    ldp q5, q4, [sp, #64]
-; CHECK-NEXT:    ldp q7, q6, [sp, #96]
+; CHECK-NEXT:    stp x11, x12, [sp, #64]
+; CHECK-NEXT:    ldp q6, q7, [sp, #96]
+; CHECK-NEXT:    stp x9, x8, [sp, #80]
+; CHECK-NEXT:    ldp q5, q2, [sp, #64]
 ; CHECK-NEXT:    stp q0, q1, [x1]
-; CHECK-NEXT:    stp q2, q3, [x1, #32]
-; CHECK-NEXT:    stp q5, q4, [x1, #96]
-; CHECK-NEXT:    stp q7, q6, [x1, #64]
+; CHECK-NEXT:    stp q3, q4, [x1, #32]
+; CHECK-NEXT:    stp q6, q7, [x1, #64]
+; CHECK-NEXT:    stp q5, q2, [x1, #96]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
@@ -278,8 +278,8 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) {
 define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) {
 ; CHECK-LABEL: fcvtzu_v2f32_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -290,8 +290,8 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) {
 define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) {
 ; CHECK-LABEL: fcvtzu_v4f32_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -303,14 +303,14 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) {
 define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) {
 ; CHECK-LABEL: fcvtzu_v8f32_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.s
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z0.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
@@ -321,21 +321,21 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) {
 define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v16f32_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ldp q3, q2, [x0, #32]
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    fcvtzu z3.s, p0/m, z3.s
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    fcvtzu z2.s, p0/m, z2.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
-; CHECK-NEXT:    stp q0, q3, [x1]
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptoui <16 x float> %op1 to <16 x i16>
@@ -350,8 +350,8 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) {
 ; CHECK-LABEL: fcvtzu_v2f32_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -362,8 +362,8 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) {
 define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) {
 ; CHECK-LABEL: fcvtzu_v4f32_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -374,8 +374,8 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) {
 define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v8f32_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
 ; CHECK-NEXT:    fcvtzu z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -393,8 +393,8 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) {
 ; CHECK-LABEL: fcvtzu_v1f32_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -406,8 +406,8 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) {
 define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) {
 ; CHECK-LABEL: fcvtzu_v2f32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -437,21 +437,20 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) {
 define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v8f32_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z3.d, z1.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z2.s
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.s
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.s
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fcvtzu z1.d, p0/m, z2.s
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.s
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptoui <8 x float> %op1 to <8 x i64>
@@ -478,8 +477,8 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) {
 define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) {
 ; CHECK-LABEL: fcvtzu_v2f64_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -493,21 +492,21 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
-; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    fmov w10, s0
-; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    mov z2.s, z0.s[1]
+; CHECK-NEXT:    mov z0.s, z1.s[1]
 ; CHECK-NEXT:    strh w8, [sp, #12]
 ; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w10, [sp, #14]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
@@ -522,37 +521,37 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z4.s, z1.s[1]
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.s, z2.s[1]
+; CHECK-NEXT:    strh w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    mov z0.s, z2.s[1]
-; CHECK-NEXT:    mov z2.s, z3.s[1]
+; CHECK-NEXT:    mov z3.s, z3.s[1]
 ; CHECK-NEXT:    strh w8, [sp]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w10, [sp, #4]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    strh w10, [sp, #10]
-; CHECK-NEXT:    strh w9, [sp, #2]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w8, [sp, #2]
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
@@ -565,67 +564,68 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldr q6, [x0, #112]
+; CHECK-NEXT:    ldp q4, q5, [x0, #80]
+; CHECK-NEXT:    ldr q7, [x0, #64]
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    ldp q4, q5, [x0]
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z6.s, z3.s[1]
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT:    mov z3.s, z2.s[1]
-; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
-; CHECK-NEXT:    ldp q0, q1, [x0, #64]
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
 ; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT:    fmov w10, s5
-; CHECK-NEXT:    mov z5.s, z5.s[1]
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q2, q7, [x0, #96]
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z16.s, z1.s[1]
+; CHECK-NEXT:    mov z1.s, z0.s[1]
 ; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    mov z4.s, z4.s[1]
-; CHECK-NEXT:    strh w10, [sp, #4]
-; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.s, z2.s[1]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.s, z3.s[1]
+; CHECK-NEXT:    strh w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s3
 ; CHECK-NEXT:    movprfx z3, z7
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    uzp1 z1.s, z4.s, z4.s
 ; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    strh w8, [sp, #28]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    uzp1 z0.s, z3.s, z3.s
+; CHECK-NEXT:    mov z3.s, z5.s[1]
+; CHECK-NEXT:    strh w8, [sp, #6]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z3.s, z3.s[1]
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    strh w9, [sp, #6]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w10, [sp, #2]
-; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov z2.s, z6.s[1]
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    strh w8, [sp, #28]
+; CHECK-NEXT:    fmov w8, s5
 ; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z4.s, z2.s[1]
-; CHECK-NEXT:    mov z2.s, z1.s[1]
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    strh w9, [sp, #20]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w10, [sp, #16]
-; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    strh w8, [sp, #20]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    strh w8, [sp, #16]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strh w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w8, [sp, #26]
 ; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w9, [sp, #26]
-; CHECK-NEXT:    strh w10, [sp, #22]
+; CHECK-NEXT:    strh w8, [sp, #22]
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    strh w8, [sp, #18]
 ; CHECK-NEXT:    ldp q1, q0, [sp]
 ; CHECK-NEXT:    stp q1, q0, [x1]
@@ -644,8 +644,8 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) {
 define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) {
 ; CHECK-LABEL: fcvtzu_v1f64_v1i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -657,8 +657,8 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) {
 define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) {
 ; CHECK-LABEL: fcvtzu_v2f64_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -670,14 +670,14 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) {
 define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) {
 ; CHECK-LABEL: fcvtzu_v4f64_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.d
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
@@ -688,21 +688,21 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) {
 define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v8f64_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ptrue p1.s, vl2
-; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q3, q2, [x0, #32]
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzu z3.d, p0/m, z3.d
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    fcvtzu z2.d, p0/m, z2.d
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    splice z3.s, p1, z3.s, z2.s
-; CHECK-NEXT:    stp q0, q3, [x1]
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptoui <8 x double> %op1 to <8 x i32>
@@ -717,8 +717,8 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) {
 ; CHECK-LABEL: fcvtzu_v1f64_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -729,8 +729,8 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) {
 define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) {
 ; CHECK-LABEL: fcvtzu_v2f64_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -741,8 +741,8 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) {
 define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzu_v4f64_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -760,8 +760,8 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) {
 define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) {
 ; CHECK-LABEL: fcvtzs_v4f16_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -772,8 +772,8 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) {
 define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v8f16_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
@@ -786,8 +786,8 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) {
 define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    fcvtzs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -805,8 +805,8 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) {
 ; CHECK-LABEL: fcvtzs_v2f16_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -818,8 +818,8 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) {
 define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) {
 ; CHECK-LABEL: fcvtzs_v4f16_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -849,21 +849,20 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) {
 define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f16_v16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    fcvtzs z2.s, p0/m, z2.h
 ; CHECK-NEXT:    fcvtzs z3.s, p0/m, z3.h
-; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.h
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.h
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fcvtzs z1.s, p0/m, z2.h
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.h
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %res = fptosi <16 x half> %op1 to <16 x i32>
@@ -890,9 +889,9 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) {
 ; CHECK-LABEL: fcvtzs_v2f16_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    fcvtzs x9, h0
+; CHECK-NEXT:    fcvtzs x9, h1
 ; CHECK-NEXT:    stp x8, x9, [sp, #-16]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [sp], #16
@@ -907,10 +906,10 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
+; CHECK-NEXT:    mov z2.h, z0.h[3]
 ; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    fcvtzs x10, h1
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    fcvtzs x10, h2
 ; CHECK-NEXT:    fcvtzs x11, h0
 ; CHECK-NEXT:    stp x8, x9, [sp, #-32]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
@@ -932,27 +931,27 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    mov z3.h, z0.h[2]
 ; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    fcvtzs x10, h1
-; CHECK-NEXT:    mov z1.h, z0.h[2]
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    fcvtzs x11, h1
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    fcvtzs x10, h2
+; CHECK-NEXT:    fcvtzs x11, h3
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    mov z2.h, z0.h[3]
 ; CHECK-NEXT:    fcvtzs x12, h0
+; CHECK-NEXT:    mov z0.h, z0.h[2]
 ; CHECK-NEXT:    stp x8, x9, [sp, #32]
 ; CHECK-NEXT:    fcvtzs x8, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
+; CHECK-NEXT:    fcvtzs x9, h2
 ; CHECK-NEXT:    stp x11, x10, [sp, #48]
-; CHECK-NEXT:    fcvtzs x9, h1
 ; CHECK-NEXT:    fcvtzs x10, h0
+; CHECK-NEXT:    ldp q2, q3, [sp, #32]
 ; CHECK-NEXT:    stp x12, x8, [sp]
-; CHECK-NEXT:    ldp q3, q2, [sp, #32]
 ; CHECK-NEXT:    stp x10, x9, [sp, #16]
 ; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    stp q3, q2, [x1]
+; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    stp q1, q0, [x1, #32]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
@@ -967,53 +966,54 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #128
 ; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    mov z2.h, z1.h[1]
-; CHECK-NEXT:    mov z3.h, z1.h[3]
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    mov z2.h, z0.h[3]
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    mov z3.h, z0.h[2]
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    fcvtzs x9, h1
+; CHECK-NEXT:    fcvtzs x10, h2
+; CHECK-NEXT:    ldr q1, [x0, #16]
+; CHECK-NEXT:    fcvtzs x11, h3
+; CHECK-NEXT:    mov z2.h, z0.h[1]
+; CHECK-NEXT:    mov z3.h, z0.h[3]
+; CHECK-NEXT:    fcvtzs x12, h1
+; CHECK-NEXT:    stp x8, x9, [sp, #32]
+; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    mov z0.h, z0.h[2]
 ; CHECK-NEXT:    fcvtzs x9, h2
-; CHECK-NEXT:    mov z2.h, z1.h[2]
-; CHECK-NEXT:    fcvtzs x8, h1
+; CHECK-NEXT:    stp x11, x10, [sp, #48]
 ; CHECK-NEXT:    fcvtzs x10, h3
-; CHECK-NEXT:    fcvtzs x11, h2
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    fcvtzs x12, h1
 ; CHECK-NEXT:    mov z2.h, z1.h[1]
 ; CHECK-NEXT:    mov z3.h, z1.h[3]
-; CHECK-NEXT:    mov z1.h, z1.h[2]
-; CHECK-NEXT:    stp x8, x9, [sp, #32]
-; CHECK-NEXT:    fcvtzs x9, h3
-; CHECK-NEXT:    stp x11, x10, [sp, #48]
-; CHECK-NEXT:    fcvtzs x10, h1
+; CHECK-NEXT:    fcvtzs x11, h0
+; CHECK-NEXT:    mov z0.h, z1.h[2]
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    stp x8, x9, [sp]
 ; CHECK-NEXT:    fcvtzs x8, h2
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    stp x10, x9, [sp, #16]
-; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    stp x12, x8, [sp]
-; CHECK-NEXT:    fcvtzs x8, h0
-; CHECK-NEXT:    fcvtzs x10, h1
-; CHECK-NEXT:    mov z1.h, z0.h[2]
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    fcvtzs x9, h3
+; CHECK-NEXT:    mov z2.h, z1.h[1]
+; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    fcvtzs x10, h0
+; CHECK-NEXT:    mov z0.h, z1.h[3]
 ; CHECK-NEXT:    fcvtzs x11, h1
-; CHECK-NEXT:    mov z1.h, z0.h[1]
-; CHECK-NEXT:    stp x8, x9, [sp, #96]
+; CHECK-NEXT:    mov z1.h, z1.h[2]
+; CHECK-NEXT:    stp x12, x8, [sp, #96]
+; CHECK-NEXT:    fcvtzs x12, h2
 ; CHECK-NEXT:    fcvtzs x8, h0
+; CHECK-NEXT:    ldp q3, q4, [sp]
+; CHECK-NEXT:    stp x10, x9, [sp, #112]
 ; CHECK-NEXT:    fcvtzs x9, h1
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    mov z0.h, z0.h[2]
-; CHECK-NEXT:    stp x11, x10, [sp, #112]
-; CHECK-NEXT:    fcvtzs x10, h1
-; CHECK-NEXT:    fcvtzs x11, h0
-; CHECK-NEXT:    stp x8, x9, [sp, #64]
 ; CHECK-NEXT:    ldp q0, q1, [sp, #32]
-; CHECK-NEXT:    stp x11, x10, [sp, #80]
-; CHECK-NEXT:    ldp q2, q3, [sp]
-; CHECK-NEXT:    ldp q5, q4, [sp, #64]
-; CHECK-NEXT:    ldp q7, q6, [sp, #96]
+; CHECK-NEXT:    stp x11, x12, [sp, #64]
+; CHECK-NEXT:    ldp q6, q7, [sp, #96]
+; CHECK-NEXT:    stp x9, x8, [sp, #80]
+; CHECK-NEXT:    ldp q5, q2, [sp, #64]
 ; CHECK-NEXT:    stp q0, q1, [x1]
-; CHECK-NEXT:    stp q2, q3, [x1, #32]
-; CHECK-NEXT:    stp q5, q4, [x1, #96]
-; CHECK-NEXT:    stp q7, q6, [x1, #64]
+; CHECK-NEXT:    stp q3, q4, [x1, #32]
+; CHECK-NEXT:    stp q6, q7, [x1, #64]
+; CHECK-NEXT:    stp q5, q2, [x1, #96]
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
@@ -1029,8 +1029,8 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) {
 define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) {
 ; CHECK-LABEL: fcvtzs_v2f32_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1041,8 +1041,8 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) {
 define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) {
 ; CHECK-LABEL: fcvtzs_v4f32_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1054,14 +1054,14 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) {
 define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) {
 ; CHECK-LABEL: fcvtzs_v8f32_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z0.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
@@ -1072,21 +1072,21 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) {
 define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v16f32_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ldp q3, q2, [x0, #32]
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    fcvtzs z3.s, p0/m, z3.s
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    fcvtzs z2.s, p0/m, z2.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
-; CHECK-NEXT:    stp q0, q3, [x1]
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %res = fptosi <16 x float> %op1 to <16 x i16>
@@ -1101,8 +1101,8 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) {
 ; CHECK-LABEL: fcvtzs_v2f32_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1113,8 +1113,8 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) {
 define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) {
 ; CHECK-LABEL: fcvtzs_v4f32_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1125,8 +1125,8 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) {
 define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v8f32_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -1144,8 +1144,8 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) {
 ; CHECK-LABEL: fcvtzs_v1f32_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1157,8 +1157,8 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) {
 define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) {
 ; CHECK-LABEL: fcvtzs_v2f32_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -1188,21 +1188,20 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) {
 define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v8f32_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z3.d, z1.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.s
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.s
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.s
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z2.s
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.s
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %res = fptosi <8 x float> %op1 to <8 x i64>
@@ -1231,8 +1230,8 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) {
 define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) {
 ; CHECK-LABEL: fcvtzs_v2f64_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1246,21 +1245,21 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
-; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    fmov w10, s0
-; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    mov z2.s, z0.s[1]
+; CHECK-NEXT:    mov z0.s, z1.s[1]
 ; CHECK-NEXT:    strh w8, [sp, #12]
 ; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w10, [sp, #14]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
@@ -1275,37 +1274,37 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z4.s, z1.s[1]
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT:    strh w9, [sp, #8]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.s, z2.s[1]
+; CHECK-NEXT:    strh w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    mov z0.s, z2.s[1]
-; CHECK-NEXT:    mov z2.s, z3.s[1]
+; CHECK-NEXT:    mov z3.s, z3.s[1]
 ; CHECK-NEXT:    strh w8, [sp]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w10, [sp, #4]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    strh w10, [sp, #10]
-; CHECK-NEXT:    strh w9, [sp, #2]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w8, [sp, #2]
 ; CHECK-NEXT:    ldr q0, [sp], #16
 ; CHECK-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
@@ -1318,67 +1317,68 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldr q6, [x0, #112]
+; CHECK-NEXT:    ldp q4, q5, [x0, #80]
+; CHECK-NEXT:    ldr q7, [x0, #64]
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    ldp q4, q5, [x0]
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z6.s, z3.s[1]
-; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
-; CHECK-NEXT:    mov z3.s, z2.s[1]
-; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
-; CHECK-NEXT:    ldp q0, q1, [x0, #64]
+; CHECK-NEXT:    fcvtzs z6.d, p0/m, z6.d
 ; CHECK-NEXT:    fcvtzs z5.d, p0/m, z5.d
-; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT:    fmov w10, s5
-; CHECK-NEXT:    mov z5.s, z5.s[1]
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT:    fcvtzs z4.d, p0/m, z4.d
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q2, q7, [x0, #96]
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z16.s, z1.s[1]
+; CHECK-NEXT:    mov z1.s, z0.s[1]
 ; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    mov z4.s, z4.s[1]
-; CHECK-NEXT:    strh w10, [sp, #4]
-; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.s, z2.s[1]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.s, z3.s[1]
+; CHECK-NEXT:    strh w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s3
 ; CHECK-NEXT:    movprfx z3, z7
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z7.d
-; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    uzp1 z1.s, z4.s, z4.s
 ; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    fmov w10, s4
-; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    strh w8, [sp, #28]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    uzp1 z0.s, z3.s, z3.s
+; CHECK-NEXT:    mov z3.s, z5.s[1]
+; CHECK-NEXT:    strh w8, [sp, #6]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z3.s, z3.s[1]
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    strh w9, [sp, #6]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    strh w10, [sp, #2]
-; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov z2.s, z6.s[1]
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    strh w8, [sp, #28]
+; CHECK-NEXT:    fmov w8, s5
 ; CHECK-NEXT:    strh w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z4.s, z2.s[1]
-; CHECK-NEXT:    mov z2.s, z1.s[1]
-; CHECK-NEXT:    mov z1.s, z0.s[1]
-; CHECK-NEXT:    strh w9, [sp, #20]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w10, [sp, #16]
-; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    strh w8, [sp, #20]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    mov z0.s, z0.s[1]
+; CHECK-NEXT:    strh w8, [sp, #16]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strh w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w8, [sp, #26]
 ; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    strh w9, [sp, #26]
-; CHECK-NEXT:    strh w10, [sp, #22]
+; CHECK-NEXT:    strh w8, [sp, #22]
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    strh w8, [sp, #18]
 ; CHECK-NEXT:    ldp q1, q0, [sp]
 ; CHECK-NEXT:    stp q1, q0, [x1]
@@ -1397,8 +1397,8 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) {
 define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) {
 ; CHECK-LABEL: fcvtzs_v1f64_v1i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1410,8 +1410,8 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) {
 define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) {
 ; CHECK-LABEL: fcvtzs_v2f64_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1423,14 +1423,14 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) {
 define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) {
 ; CHECK-LABEL: fcvtzs_v4f64_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
@@ -1441,21 +1441,21 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) {
 define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v8f64_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ptrue p1.s, vl2
-; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q3, q2, [x0, #32]
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z3.d, p0/m, z3.d
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    fcvtzs z2.d, p0/m, z2.d
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    splice z3.s, p1, z3.s, z2.s
-; CHECK-NEXT:    stp q0, q3, [x1]
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %res = fptosi <8 x double> %op1 to <8 x i32>
@@ -1470,8 +1470,8 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) {
 ; CHECK-LABEL: fcvtzs_v1f64_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1482,8 +1482,8 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) {
 define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) {
 ; CHECK-LABEL: fcvtzs_v2f64_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1494,8 +1494,8 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) {
 define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvtzs_v4f64_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 685efd0574347c..41981f06d65523 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -11,12 +11,12 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT:    mov z3.s, z2.s[1]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    strh w9, [sp, #10]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    ldr d2, [sp, #8]
 ; CHECK-NEXT:    lsl z2.h, z2.h, #15
 ; CHECK-NEXT:    asr z2.h, z2.h, #15
@@ -70,14 +70,14 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask
 define void @select_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fcmeq p1.h, p0/z, z1.h, z2.h
-; CHECK-NEXT:    sel z1.h, p1, z1.h, z2.h
-; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z3.h
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -127,14 +127,14 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m
 define void @select_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fcmeq p1.s, p0/z, z1.s, z2.s
-; CHECK-NEXT:    sel z1.s, p1, z1.s, z2.s
-; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z3.s
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z2.s, z3.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -185,14 +185,14 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1>
 define void @select_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fcmeq p1.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    sel z1.d, p1, z1.d, z2.d
-; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z3.d
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    fcmeq p0.d, p0/z, z2.d, z3.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index 8252ca84aa0482..0b3e7695e6a0a5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -11,14 +11,14 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) {
 ; CHECK-LABEL: insertelement_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3 // =0x3
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z2.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    index z1.h, #0, #1
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
     %r = insertelement <4 x i8> %op1, i8 5, i64 3
@@ -28,14 +28,14 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) {
 define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) {
 ; CHECK-LABEL: insertelement_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #7 // =0x7
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z2.b, #0, #1
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov w8, #7 // =0x7
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    mov z2.b, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov z1.b, w8
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z1.b
-; CHECK-NEXT:    mov z0.b, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
     %r = insertelement <8 x i8> %op1, i8 5, i64 7
@@ -45,14 +45,14 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) {
 define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) {
 ; CHECK-LABEL: insertelement_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #15 // =0xf
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z2.b, #0, #1
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov w8, #15 // =0xf
+; CHECK-NEXT:    index z1.b, #0, #1
+; CHECK-NEXT:    mov z2.b, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z1.b, w8
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z1.b
-; CHECK-NEXT:    mov z0.b, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z2.b
+; CHECK-NEXT:    mov z0.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
     %r = insertelement <16 x i8> %op1, i8 5, i64 15
@@ -62,14 +62,14 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) {
 define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) {
 ; CHECK-LABEL: insertelement_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #15 // =0xf
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z3.b, #0, #1
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mov w8, #15 // =0xf
+; CHECK-NEXT:    index z2.b, #0, #1
+; CHECK-NEXT:    mov z3.b, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    mov z2.b, w8
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z3.b, z2.b
-; CHECK-NEXT:    mov z1.b, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT:    mov z1.b, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
     %r = insertelement <32 x i8> %op1, i8 5, i64 31
@@ -80,14 +80,14 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) {
 define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) {
 ; CHECK-LABEL: insertelement_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z2.s, #0, #1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z1.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z1.s
-; CHECK-NEXT:    mov z0.s, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
     %r = insertelement <2 x i16> %op1, i16 5, i64 1
@@ -97,14 +97,14 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) {
 define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) {
 ; CHECK-LABEL: insertelement_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3 // =0x3
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z2.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    index z1.h, #0, #1
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
     %r = insertelement <4 x i16> %op1, i16 5, i64 3
@@ -114,14 +114,14 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) {
 define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) {
 ; CHECK-LABEL: insertelement_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #7 // =0x7
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z2.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #7 // =0x7
+; CHECK-NEXT:    index z1.h, #0, #1
+; CHECK-NEXT:    mov z2.h, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z1.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z1.h
-; CHECK-NEXT:    mov z0.h, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    mov z0.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
     %r = insertelement <8 x i16> %op1, i16 5, i64 7
@@ -131,14 +131,14 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) {
 define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) {
 ; CHECK-LABEL: insertelement_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #7 // =0x7
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z3.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mov w8, #7 // =0x7
+; CHECK-NEXT:    index z2.h, #0, #1
+; CHECK-NEXT:    mov z3.h, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z3.h, z2.h
-; CHECK-NEXT:    mov z1.h, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    mov z1.h, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
     %r = insertelement <16 x i16> %op1, i16 5, i64 15
@@ -149,14 +149,14 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) {
 define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) {
 ; CHECK-LABEL: insertelement_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z2.s, #0, #1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z1.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z1.s
-; CHECK-NEXT:    mov z0.s, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
     %r = insertelement <2 x i32> %op1, i32 5, i64 1
@@ -166,14 +166,14 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) {
 define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) {
 ; CHECK-LABEL: insertelement_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3 // =0x3
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z2.s, #0, #1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    index z1.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z1.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z1.s
-; CHECK-NEXT:    mov z0.s, p0/m, w9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    mov z0.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
     %r = insertelement <4 x i32> %op1, i32 5, i64 3
@@ -183,13 +183,13 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) {
 define <8 x i32> @insertelement_v8i32(ptr %a) {
 ; CHECK-LABEL: insertelement_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3 // =0x3
-; CHECK-NEXT:    index z3.s, #0, #1
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov w8, #5 // =0x5
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z3.s, z2.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    mov z1.s, p0/m, w8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
@@ -212,14 +212,14 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) {
 define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) {
 ; CHECK-LABEL: insertelement_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    mov w9, #5 // =0x5
-; CHECK-NEXT:    index z2.d, #0, #1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z1.d, #0, #1
+; CHECK-NEXT:    mov z2.d, x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    mov z1.d, x8
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z1.d
-; CHECK-NEXT:    mov z0.d, p0/m, x9
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    mov z0.d, p0/m, x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
     %r = insertelement <2 x i64> %op1, i64 5, i64 1
@@ -229,13 +229,13 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) {
 define <4 x i64> @insertelement_v4i64(ptr %a) {
 ; CHECK-LABEL: insertelement_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    index z3.d, #0, #1
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z2.d, x8
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mov w8, #5 // =0x5
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z3.d, z2.d
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    mov z1.d, p0/m, x8
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
@@ -264,13 +264,13 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 define <4 x half> @insertelement_v4f16(<4 x half> %op1) {
 ; CHECK-LABEL: insertelement_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3 // =0x3
-; CHECK-NEXT:    fmov h1, #5.00000000
-; CHECK-NEXT:    index z3.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    fmov h1, #5.00000000
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -281,13 +281,13 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) {
 define <8 x half> @insertelement_v8f16(<8 x half> %op1) {
 ; CHECK-LABEL: insertelement_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #7 // =0x7
-; CHECK-NEXT:    fmov h1, #5.00000000
-; CHECK-NEXT:    index z3.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov w8, #7 // =0x7
+; CHECK-NEXT:    index z1.h, #0, #1
 ; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z2.h
+; CHECK-NEXT:    fmov h1, #5.00000000
 ; CHECK-NEXT:    mov z0.h, p0/m, h1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -298,14 +298,14 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) {
 define <16 x half> @insertelement_v16f16(ptr %a) {
 ; CHECK-LABEL: insertelement_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov w8, #7 // =0x7
-; CHECK-NEXT:    fmov h3, #5.00000000
-; CHECK-NEXT:    index z4.h, #0, #1
 ; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    mov z2.h, w8
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z4.h, z2.h
-; CHECK-NEXT:    mov z1.h, p0/m, h3
+; CHECK-NEXT:    mov w8, #7 // =0x7
+; CHECK-NEXT:    index z0.h, #0, #1
+; CHECK-NEXT:    mov z1.h, w8
+; CHECK-NEXT:    fmov h2, #5.00000000
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    mov z1.h, p0/m, h2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
     %op1 = load <16 x half>, ptr %a
@@ -317,13 +317,13 @@ define <16 x half> @insertelement_v16f16(ptr %a) {
 define <2 x float> @insertelement_v2f32(<2 x float> %op1) {
 ; CHECK-LABEL: insertelement_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    fmov s1, #5.00000000
-; CHECK-NEXT:    index z3.s, #0, #1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z1.s, #0, #1
 ; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z3.s, z2.s
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    fmov s1, #5.00000000
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -334,13 +334,13 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) {
 define <4 x float> @insertelement_v4f32(<4 x float> %op1) {
 ; CHECK-LABEL: insertelement_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3 // =0x3
-; CHECK-NEXT:    fmov s1, #5.00000000
-; CHECK-NEXT:    index z3.s, #0, #1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    index z1.s, #0, #1
 ; CHECK-NEXT:    mov z2.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z3.s, z2.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    fmov s1, #5.00000000
 ; CHECK-NEXT:    mov z0.s, p0/m, s1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -351,14 +351,14 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) {
 define <8 x float> @insertelement_v8f32(ptr %a) {
 ; CHECK-LABEL: insertelement_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov w8, #3 // =0x3
-; CHECK-NEXT:    fmov s4, #5.00000000
-; CHECK-NEXT:    index z2.s, #0, #1
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    mov z3.s, w8
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z3.s
-; CHECK-NEXT:    mov z1.s, p0/m, s4
+; CHECK-NEXT:    mov w8, #3 // =0x3
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    fmov s2, #5.00000000
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    mov z1.s, p0/m, s2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
     %op1 = load <8 x float>, ptr %a
@@ -379,13 +379,13 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) {
 define <2 x double> @insertelement_v2f64(<2 x double> %op1) {
 ; CHECK-LABEL: insertelement_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    fmov d1, #5.00000000
-; CHECK-NEXT:    index z3.d, #0, #1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z1.d, #0, #1
 ; CHECK-NEXT:    mov z2.d, x8
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z3.d, z2.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z2.d
+; CHECK-NEXT:    fmov d1, #5.00000000
 ; CHECK-NEXT:    mov z0.d, p0/m, d1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -396,14 +396,14 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) {
 define <4 x double> @insertelement_v4f64(ptr %a) {
 ; CHECK-LABEL: insertelement_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    fmov d4, #5.00000000
-; CHECK-NEXT:    index z2.d, #0, #1
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov z3.d, x8
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z3.d
-; CHECK-NEXT:    mov z1.d, p0/m, d4
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    index z0.d, #0, #1
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    fmov d2, #5.00000000
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    mov z1.d, p0/m, d2
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
     %op1 = load <4 x double>, ptr %a

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index e93c3b4cf685f0..a66ffc10ca5f42 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -46,10 +46,10 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @add_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: add_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    add z0.b, z0.b, z2.b
-; CHECK-NEXT:    add z1.b, z1.b, z3.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    add z0.b, z1.b, z0.b
+; CHECK-NEXT:    add z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -98,10 +98,10 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @add_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: add_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEXT:    add z1.h, z1.h, z3.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-NEXT:    add z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -138,10 +138,10 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @add_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: add_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEXT:    add z1.s, z1.s, z3.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    add z0.s, z1.s, z0.s
+; CHECK-NEXT:    add z1.s, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -178,10 +178,10 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @add_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: add_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
@@ -198,8 +198,8 @@ define void @add_v4i64(ptr %a, ptr %b) {
 define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE-LABEL: mul_v4i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.h, vl4
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -219,8 +219,8 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE-LABEL: mul_v8i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.b, vl8
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    mul z0.b, p0/m, z0.b, z1.b
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -240,8 +240,8 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE-LABEL: mul_v16i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.b, vl16
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    mul z0.b, p0/m, z0.b, z1.b
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -261,20 +261,21 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @mul_v32i8(ptr %a, ptr %b) {
 ; SVE-LABEL: mul_v32i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.b, vl16
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    mul z0.b, p0/m, z0.b, z2.b
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    mul z0.b, p0/m, z0.b, z1.b
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    mul z1.b, p0/m, z1.b, z3.b
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: mul_v32i8:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    mul z0.b, z0.b, z2.b
-; SVE2-NEXT:    mul z1.b, z1.b, z3.b
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    mul z0.b, z1.b, z0.b
+; SVE2-NEXT:    mul z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -287,8 +288,8 @@ define void @mul_v32i8(ptr %a, ptr %b) {
 define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE-LABEL: mul_v2i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.s, vl2
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -308,8 +309,8 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE-LABEL: mul_v4i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.h, vl4
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -329,8 +330,8 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE-LABEL: mul_v8i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.h, vl8
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -350,20 +351,21 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @mul_v16i16(ptr %a, ptr %b) {
 ; SVE-LABEL: mul_v16i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.h, vl8
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    mul z0.h, p0/m, z0.h, z2.h
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    mul z1.h, p0/m, z1.h, z3.h
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: mul_v16i16:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    mul z0.h, z0.h, z2.h
-; SVE2-NEXT:    mul z1.h, z1.h, z3.h
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    mul z0.h, z1.h, z0.h
+; SVE2-NEXT:    mul z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -376,8 +378,8 @@ define void @mul_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE-LABEL: mul_v2i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.s, vl2
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -397,8 +399,8 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE-LABEL: mul_v4i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.s, vl4
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -418,20 +420,21 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @mul_v8i32(ptr %a, ptr %b) {
 ; SVE-LABEL: mul_v8i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.s, vl4
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    mul z0.s, p0/m, z0.s, z2.s
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    mul z1.s, p0/m, z1.s, z3.s
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: mul_v8i32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    mul z0.s, z0.s, z2.s
-; SVE2-NEXT:    mul z1.s, z1.s, z3.s
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    mul z0.s, z1.s, z0.s
+; SVE2-NEXT:    mul z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -444,8 +447,8 @@ define void @mul_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE-LABEL: mul_v1i64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.d, vl1
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -465,8 +468,8 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE-LABEL: mul_v2i64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.d, vl2
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -486,20 +489,21 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @mul_v4i64(ptr %a, ptr %b) {
 ; SVE-LABEL: mul_v4i64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.d, vl2
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    mul z1.d, p0/m, z1.d, z3.d
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: mul_v4i64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    mul z0.d, z0.d, z2.d
-; SVE2-NEXT:    mul z1.d, z1.d, z3.d
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    mul z0.d, z1.d, z0.d
+; SVE2-NEXT:    mul z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
@@ -552,10 +556,10 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @sub_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: sub_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    sub z0.b, z0.b, z2.b
-; CHECK-NEXT:    sub z1.b, z1.b, z3.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    sub z0.b, z1.b, z0.b
+; CHECK-NEXT:    sub z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -604,10 +608,10 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @sub_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: sub_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    sub z0.h, z0.h, z2.h
-; CHECK-NEXT:    sub z1.h, z1.h, z3.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    sub z0.h, z1.h, z0.h
+; CHECK-NEXT:    sub z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -644,10 +648,10 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @sub_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: sub_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    sub z0.s, z0.s, z2.s
-; CHECK-NEXT:    sub z1.s, z1.s, z3.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    sub z0.s, z1.s, z0.s
+; CHECK-NEXT:    sub z1.s, z2.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -684,10 +688,10 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @sub_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: sub_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    sub z0.d, z0.d, z2.d
-; CHECK-NEXT:    sub z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    sub z0.d, z1.d, z0.d
+; CHECK-NEXT:    sub z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
@@ -704,8 +708,8 @@ define void @sub_v4i64(ptr %a, ptr %b) {
 define <4 x i8> @abs_v4i8(<4 x i8> %op1) {
 ; CHECK-LABEL: abs_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sxtb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -717,8 +721,8 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) {
 define <8 x i8> @abs_v8i8(<8 x i8> %op1) {
 ; CHECK-LABEL: abs_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    abs z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -729,8 +733,8 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) {
 define <16 x i8> @abs_v16i8(<16 x i8> %op1) {
 ; CHECK-LABEL: abs_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    abs z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -741,8 +745,8 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) {
 define void @abs_v32i8(ptr %a) {
 ; CHECK-LABEL: abs_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    abs z0.b, p0/m, z0.b
 ; CHECK-NEXT:    abs z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -756,8 +760,8 @@ define void @abs_v32i8(ptr %a) {
 define <2 x i16> @abs_v2i16(<2 x i16> %op1) {
 ; CHECK-LABEL: abs_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -769,8 +773,8 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) {
 define <4 x i16> @abs_v4i16(<4 x i16> %op1) {
 ; CHECK-LABEL: abs_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -781,8 +785,8 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) {
 define <8 x i16> @abs_v8i16(<8 x i16> %op1) {
 ; CHECK-LABEL: abs_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -793,8 +797,8 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) {
 define void @abs_v16i16(ptr %a) {
 ; CHECK-LABEL: abs_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    abs z0.h, p0/m, z0.h
 ; CHECK-NEXT:    abs z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -808,8 +812,8 @@ define void @abs_v16i16(ptr %a) {
 define <2 x i32> @abs_v2i32(<2 x i32> %op1) {
 ; CHECK-LABEL: abs_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -820,8 +824,8 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) {
 define <4 x i32> @abs_v4i32(<4 x i32> %op1) {
 ; CHECK-LABEL: abs_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -832,8 +836,8 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) {
 define void @abs_v8i32(ptr %a) {
 ; CHECK-LABEL: abs_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    abs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -847,8 +851,8 @@ define void @abs_v8i32(ptr %a) {
 define <1 x i64> @abs_v1i64(<1 x i64> %op1) {
 ; CHECK-LABEL: abs_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -859,8 +863,8 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) {
 define <2 x i64> @abs_v2i64(<2 x i64> %op1) {
 ; CHECK-LABEL: abs_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -871,8 +875,8 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) {
 define void @abs_v4i64(ptr %a) {
 ; CHECK-LABEL: abs_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
index 6d37b119782ba7..3cbd4bb129822b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
@@ -40,12 +40,12 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @icmp_eq_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_eq_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z2.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z1.b, z0.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
 ; CHECK-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, z3.b
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -90,12 +90,12 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @icmp_eq_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_eq_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z1.h, z0.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z3.h
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z1.h, z3.h
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -140,12 +140,12 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @icmp_eq_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_eq_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z1.s, z0.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z3.s
 ; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z1.s, z3.s
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -190,12 +190,12 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @icmp_eq_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_eq_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z1.d, z0.d
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z3.d
 ; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -214,12 +214,12 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) {
 define void @icmp_ne_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_ne_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, z2.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    cmpne p1.b, p0/z, z1.b, z0.b
+; CHECK-NEXT:    cmpne p0.b, p0/z, z2.b, z3.b
 ; CHECK-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, z3.b
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -238,8 +238,8 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) {
 define void @icmp_sge_v8i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_sge_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    cmpge p0.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
@@ -260,12 +260,12 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) {
 define void @icmp_sgt_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_sgt_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpgt p1.h, p0/z, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    cmpgt p1.h, p0/z, z1.h, z0.h
+; CHECK-NEXT:    cmpgt p0.h, p0/z, z2.h, z3.h
 ; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    cmpgt p0.h, p0/z, z1.h, z3.h
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -284,8 +284,8 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) {
 define void @icmp_sle_v4i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_sle_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    cmpge p0.s, p0/z, z1.s, z0.s
 ; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
@@ -306,12 +306,12 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) {
 define void @icmp_slt_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_slt_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpgt p1.s, p0/z, z2.s, z0.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    cmpgt p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmpgt p0.s, p0/z, z3.s, z2.s
 ; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    cmpgt p0.s, p0/z, z3.s, z1.s
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -330,8 +330,8 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) {
 define void @icmp_uge_v2i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_uge_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    cmphs p0.d, p0/z, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
@@ -352,8 +352,8 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) {
 define void @icmp_ugt_v2i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_ugt_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    cmphi p0.d, p0/z, z0.d, z1.d
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
@@ -374,8 +374,8 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) {
 define void @icmp_ule_v2i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_ule_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    cmphs p0.d, p0/z, z1.d, z0.d
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff
@@ -396,8 +396,8 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) {
 define void @icmp_ult_v2i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: icmp_ult_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    cmphi p0.d, p0/z, z1.d, z0.d
 ; CHECK-NEXT:    mov z0.d, p0/z, #-1 // =0xffffffffffffffff

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 9b7ec3e423c2fe..421bfe15207673 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -11,14 +11,14 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-LABEL: sdiv_v4i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    sxtb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    sxtb z0.h, p0/m, z0.h
+; CHECK-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -43,11 +43,11 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    splice z2.h, p0, z2.h, z0.h
-; CHECK-NEXT:    uzp1 z0.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %res = sdiv <8 x i8> %op1, %op2
@@ -61,41 +61,40 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.h, z2.b
 ; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    sunpklo z5.s, z1.h
+; CHECK-NEXT:    sunpklo z5.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z3.s, z0.h
+; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
-; CHECK-NEXT:    movprfx z2, z3
-; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z5.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z1.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
-; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z0.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
-; CHECK-NEXT:    splice z0.b, p0, z0.b, z2.b
+; CHECK-NEXT:    uzp1 z0.b, z3.b, z3.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = sdiv <16 x i8> %op1, %op2
@@ -105,76 +104,78 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @sdiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: sdiv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q6, q2, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    ptrue p2.b, vl8
-; CHECK-NEXT:    ldp q1, q3, [x1]
-; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    ldp q7, q3, [x1]
+; CHECK-NEXT:    mov z1.d, z2.d
+; CHECK-NEXT:    mov z16.d, z6.d
+; CHECK-NEXT:    mov z0.d, z3.d
+; CHECK-NEXT:    ext z1.b, z1.b, z2.b, #8
 ; CHECK-NEXT:    sunpklo z2.h, z2.b
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    sunpklo z5.h, z5.b
-; CHECK-NEXT:    sunpklo z7.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    sunpklo z5.s, z5.h
-; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    ext z16.b, z16.b, z6.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z3.h, z3.b
+; CHECK-NEXT:    sunpklo z6.h, z6.b
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z16.h, z16.b
+; CHECK-NEXT:    sunpklo z4.h, z0.b
+; CHECK-NEXT:    sunpklo z5.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z18.s, z16.h
+; CHECK-NEXT:    sunpklo z0.s, z4.h
 ; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    sunpklo z4.h, z4.b
-; CHECK-NEXT:    sunpklo z6.s, z4.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    sunpklo z7.s, z3.h
+; CHECK-NEXT:    sunpklo z16.s, z16.h
+; CHECK-NEXT:    sdivr z0.s, p0/m, z0.s, z5.s
 ; CHECK-NEXT:    sunpklo z5.s, z2.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    splice z6.h, p1, z6.h, z4.h
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    uzp1 z4.b, z6.b, z6.b
-; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z7.s
-; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEXT:    sunpklo z6.h, z0.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    splice z5.h, p1, z5.h, z2.h
-; CHECK-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEXT:    sunpklo z7.s, z0.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z7.s
-; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
-; CHECK-NEXT:    sunpklo z2.s, z3.h
-; CHECK-NEXT:    sunpklo z7.s, z6.h
+; CHECK-NEXT:    sdiv z1.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    sunpklo z4.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z6.s, z6.h
-; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z7.s
-; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z6.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    ext z5.b, z5.b, z7.b, #8
+; CHECK-NEXT:    sunpklo z7.h, z7.b
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    sunpklo z5.h, z5.b
+; CHECK-NEXT:    sunpklo z17.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
+; CHECK-NEXT:    sunpklo z18.s, z6.h
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
-; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
-; CHECK-NEXT:    uzp1 z1.b, z2.b, z2.b
-; CHECK-NEXT:    uzp1 z2.b, z5.b, z5.b
-; CHECK-NEXT:    splice z1.b, p2, z1.b, z0.b
-; CHECK-NEXT:    splice z2.b, p2, z2.b, z4.b
-; CHECK-NEXT:    stp q1, q2, [x0]
+; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z16.s
+; CHECK-NEXT:    sunpklo z16.s, z7.h
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    sunpklo z7.s, z7.h
+; CHECK-NEXT:    uzp1 z3.h, z17.h, z17.h
+; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    sdiv z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z5.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    uzp1 z3.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z6.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z2.b, z7.b, z7.b
+; CHECK-NEXT:    splice z3.b, p0, z3.b, z0.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z1.b
+; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -186,9 +187,9 @@ define void @sdiv_v32i8(ptr %a, ptr %b) {
 define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-LABEL: sdiv_v2i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    sxth z1.s, p0/m, z1.s
 ; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
@@ -201,9 +202,9 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: sdiv_v4i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
@@ -221,18 +222,18 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -243,34 +244,38 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @sdiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: sdiv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q3, q0, [x1]
+; CHECK-NEXT:    ldp q4, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    sunpklo z6.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    sunpklo z4.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z7.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ldr q0, [x0, #16]
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z4.s, z4.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z5.s, z2.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    sdivr z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT:    sunpklo z2.s, z3.h
-; CHECK-NEXT:    sdiv z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    movprfx z2, z7
-; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z6.s
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    ldr q3, [x0]
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sunpklo z6.s, z6.h
+; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z1.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
-; CHECK-NEXT:    stp q2, q3, [x0]
+; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z1.h, z5.h, z5.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
+; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -282,8 +287,8 @@ define void @sdiv_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: sdiv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -295,8 +300,8 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: sdiv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -308,10 +313,11 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @sdiv_v8i32(ptr %a, ptr %b)  {
 ; CHECK-LABEL: sdiv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    sdivr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    sdiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -325,8 +331,8 @@ define void @sdiv_v8i32(ptr %a, ptr %b)  {
 define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: sdiv_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -338,8 +344,8 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: sdiv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -351,10 +357,11 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @sdiv_v4i64(ptr %a, ptr %b)  {
 ; CHECK-LABEL: sdiv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    sdiv z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    sdivr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    sdiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -403,11 +410,11 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    splice z2.h, p0, z2.h, z0.h
-; CHECK-NEXT:    uzp1 z0.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %res = udiv <8 x i8> %op1, %op2
@@ -421,41 +428,40 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uunpklo z5.s, z1.h
+; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z3.s, z0.h
+; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
-; CHECK-NEXT:    movprfx z2, z3
-; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z5.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z1.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z2.h
+; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
-; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z0.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
-; CHECK-NEXT:    splice z0.b, p0, z0.b, z2.b
+; CHECK-NEXT:    uzp1 z0.b, z3.b, z3.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = udiv <16 x i8> %op1, %op2
@@ -465,76 +471,78 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @udiv_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: udiv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q6, q2, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    ptrue p2.b, vl8
-; CHECK-NEXT:    ldp q1, q3, [x1]
-; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    ldp q7, q3, [x1]
+; CHECK-NEXT:    mov z1.d, z2.d
+; CHECK-NEXT:    mov z16.d, z6.d
+; CHECK-NEXT:    mov z0.d, z3.d
+; CHECK-NEXT:    ext z1.b, z1.b, z2.b, #8
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    uunpklo z5.h, z5.b
-; CHECK-NEXT:    uunpklo z7.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    uunpklo z5.s, z5.h
-; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    ext z16.b, z16.b, z6.b, #8
+; CHECK-NEXT:    ext z0.b, z0.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
+; CHECK-NEXT:    uunpklo z6.h, z6.b
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z16.h, z16.b
+; CHECK-NEXT:    uunpklo z4.h, z0.b
+; CHECK-NEXT:    uunpklo z5.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z18.s, z16.h
+; CHECK-NEXT:    uunpklo z0.s, z4.h
 ; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    uunpklo z4.h, z4.b
-; CHECK-NEXT:    uunpklo z6.s, z4.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
+; CHECK-NEXT:    ext z16.b, z16.b, z16.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uunpklo z7.s, z3.h
+; CHECK-NEXT:    uunpklo z16.s, z16.h
+; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z5.s
 ; CHECK-NEXT:    uunpklo z5.s, z2.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    splice z6.h, p1, z6.h, z4.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uzp1 z4.b, z6.b, z6.b
-; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z7.s
-; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    uunpklo z6.h, z0.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    splice z5.h, p1, z5.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpklo z7.s, z0.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z7.s
-; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z3.h
-; CHECK-NEXT:    uunpklo z7.s, z6.h
+; CHECK-NEXT:    udiv z1.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    uunpklo z4.s, z3.h
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z6.s, z6.h
-; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z7.s
-; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z6.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    mov z5.d, z7.d
+; CHECK-NEXT:    ext z5.b, z5.b, z7.b, #8
+; CHECK-NEXT:    uunpklo z7.h, z7.b
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uunpklo z5.h, z5.b
+; CHECK-NEXT:    uunpklo z17.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
+; CHECK-NEXT:    uunpklo z18.s, z6.h
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
-; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
-; CHECK-NEXT:    uzp1 z1.b, z2.b, z2.b
-; CHECK-NEXT:    uzp1 z2.b, z5.b, z5.b
-; CHECK-NEXT:    splice z1.b, p2, z1.b, z0.b
-; CHECK-NEXT:    splice z2.b, p2, z2.b, z4.b
-; CHECK-NEXT:    stp q1, q2, [x0]
+; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z16.s
+; CHECK-NEXT:    uunpklo z16.s, z7.h
+; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    uunpklo z7.s, z7.h
+; CHECK-NEXT:    uzp1 z3.h, z17.h, z17.h
+; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z18.s
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    udiv z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z5.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    uzp1 z3.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z6.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z2.b, z7.b, z7.b
+; CHECK-NEXT:    splice z3.b, p0, z3.b, z0.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z1.b
+; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -546,9 +554,9 @@ define void @udiv_v32i8(ptr %a, ptr %b) {
 define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-LABEL: udiv_v2i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    and z1.s, z1.s, #0xffff
 ; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
@@ -561,9 +569,9 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: udiv_v4i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
@@ -581,18 +589,18 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -603,34 +611,38 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @udiv_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: udiv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q3, q0, [x1]
+; CHECK-NEXT:    ldp q4, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    uunpklo z6.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    uunpklo z4.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z7.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ldr q0, [x0, #16]
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z4.b, #8
+; CHECK-NEXT:    uunpklo z4.s, z4.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z5.s, z2.h
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT:    uunpklo z2.s, z3.h
-; CHECK-NEXT:    udiv z1.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    movprfx z2, z7
-; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z6.s
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    ldr q3, [x0]
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z6.s, z6.h
+; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z1.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
-; CHECK-NEXT:    stp q2, q3, [x0]
+; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z1.h, z5.h, z5.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
+; CHECK-NEXT:    stp q3, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -642,8 +654,8 @@ define void @udiv_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: udiv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -655,8 +667,8 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: udiv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -668,10 +680,11 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @udiv_v8i32(ptr %a, ptr %b)  {
 ; CHECK-LABEL: udiv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    udivr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    udiv z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -685,8 +698,8 @@ define void @udiv_v8i32(ptr %a, ptr %b)  {
 define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: udiv_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -698,8 +711,8 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: udiv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -711,10 +724,11 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @udiv_v4i64(ptr %a, ptr %b)  {
 ; CHECK-LABEL: udiv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    udiv z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    udivr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    udiv z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -728,40 +742,40 @@ define void @udiv_v4i64(ptr %a, ptr %b)  {
 define void @udiv_constantsplat_v8i32(ptr %a)  {
 ; SVE-LABEL: udiv_constantsplat_v8i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
-; SVE-NEXT:    mov w8, #8969 // =0x2309
 ; SVE-NEXT:    ptrue p0.s, vl4
+; SVE-NEXT:    mov w8, #8969 // =0x2309
 ; SVE-NEXT:    movk w8, #22765, lsl #16
-; SVE-NEXT:    mov z2.s, w8
-; SVE-NEXT:    movprfx z3, z0
-; SVE-NEXT:    umulh z3.s, p0/m, z3.s, z2.s
-; SVE-NEXT:    umulh z2.s, p0/m, z2.s, z1.s
-; SVE-NEXT:    sub z0.s, z0.s, z3.s
-; SVE-NEXT:    sub z1.s, z1.s, z2.s
-; SVE-NEXT:    lsr z0.s, z0.s, #1
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    mov z0.s, w8
+; SVE-NEXT:    movprfx z3, z1
+; SVE-NEXT:    umulh z3.s, p0/m, z3.s, z0.s
+; SVE-NEXT:    sub z1.s, z1.s, z3.s
+; SVE-NEXT:    umulh z0.s, p0/m, z0.s, z2.s
 ; SVE-NEXT:    lsr z1.s, z1.s, #1
-; SVE-NEXT:    add z0.s, z0.s, z3.s
-; SVE-NEXT:    add z1.s, z1.s, z2.s
-; SVE-NEXT:    lsr z0.s, z0.s, #6
+; SVE-NEXT:    sub z2.s, z2.s, z0.s
+; SVE-NEXT:    add z1.s, z1.s, z3.s
+; SVE-NEXT:    lsr z2.s, z2.s, #1
 ; SVE-NEXT:    lsr z1.s, z1.s, #6
-; SVE-NEXT:    stp q0, q1, [x0]
+; SVE-NEXT:    add z0.s, z2.s, z0.s
+; SVE-NEXT:    lsr z0.s, z0.s, #6
+; SVE-NEXT:    stp q1, q0, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: udiv_constantsplat_v8i32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
 ; SVE2-NEXT:    mov w8, #8969 // =0x2309
+; SVE2-NEXT:    ldp q1, q2, [x0]
 ; SVE2-NEXT:    movk w8, #22765, lsl #16
-; SVE2-NEXT:    mov z2.s, w8
-; SVE2-NEXT:    umulh z3.s, z0.s, z2.s
-; SVE2-NEXT:    umulh z2.s, z1.s, z2.s
-; SVE2-NEXT:    sub z0.s, z0.s, z3.s
-; SVE2-NEXT:    sub z1.s, z1.s, z2.s
-; SVE2-NEXT:    usra z3.s, z0.s, #1
-; SVE2-NEXT:    usra z2.s, z1.s, #1
-; SVE2-NEXT:    lsr z0.s, z3.s, #6
-; SVE2-NEXT:    lsr z1.s, z2.s, #6
-; SVE2-NEXT:    stp q0, q1, [x0]
+; SVE2-NEXT:    mov z0.s, w8
+; SVE2-NEXT:    umulh z3.s, z1.s, z0.s
+; SVE2-NEXT:    umulh z0.s, z2.s, z0.s
+; SVE2-NEXT:    sub z1.s, z1.s, z3.s
+; SVE2-NEXT:    sub z2.s, z2.s, z0.s
+; SVE2-NEXT:    usra z3.s, z1.s, #1
+; SVE2-NEXT:    usra z0.s, z2.s, #1
+; SVE2-NEXT:    lsr z1.s, z3.s, #6
+; SVE2-NEXT:    lsr z0.s, z0.s, #6
+; SVE2-NEXT:    stp q1, q0, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = udiv <8 x i32> %op1, <i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95, i32 95>

diff  --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index 9b79467c6df3c1..e39b51ace8d694 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -78,17 +78,17 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 define void @sext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-LABEL: sext_v32i8_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z1.b, z1.b, z1.b
 ; CHECK-NEXT:    sunpklo z2.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z3.h, z1.b
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    stp q2, q0, [x1]
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -122,15 +122,15 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sunpklo z1.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
-; CHECK-NEXT:    sunpklo z0.s, z1.h
-; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
@@ -140,31 +140,31 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 define void @sext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-LABEL: sext_v32i8_v32i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z1.b, z1.b, z1.b
 ; CHECK-NEXT:    sunpklo z2.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z3.h, z1.b
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z0.h, z0.b
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z6.s, z0.h
-; CHECK-NEXT:    stp q5, q3, [x1, #64]
-; CHECK-NEXT:    sunpklo z5.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z7.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    stp q4, q2, [x1]
-; CHECK-NEXT:    stp q6, q0, [x1, #32]
-; CHECK-NEXT:    stp q5, q1, [x1, #96]
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    stp q4, q2, [x1, #64]
+; CHECK-NEXT:    stp q5, q3, [x1]
+; CHECK-NEXT:    stp q6, q0, [x1, #96]
+; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -226,29 +226,31 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sunpklo z1.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z6.d, z0.s
+; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z4.d, z2.s
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    sunpklo z2.d, z2.s
 ; CHECK-NEXT:    sunpklo z5.d, z3.s
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    sunpklo z7.d, z1.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    mov z6.d, z0.d
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    stp q5, q3, [x0, #64]
 ; CHECK-NEXT:    stp q4, q2, [x0]
-; CHECK-NEXT:    stp q6, q0, [x0, #96]
-; CHECK-NEXT:    sunpklo z0.d, z1.s
-; CHECK-NEXT:    stp q7, q0, [x0, #32]
+; CHECK-NEXT:    sunpklo z4.d, z7.s
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    stp q5, q3, [x0, #64]
+; CHECK-NEXT:    sunpklo z2.d, z6.s
+; CHECK-NEXT:    stp q1, q4, [x0, #32]
+; CHECK-NEXT:    stp q0, q2, [x0, #96]
 ; CHECK-NEXT:    ret
   %b = sext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
@@ -258,59 +260,65 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 define void @sext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-LABEL: sext_v32i8_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
-; CHECK-NEXT:    sunpklo z2.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
+; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    sunpklo z0.h, z0.b
-; CHECK-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z5.s, z3.h
+; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    sunpklo z1.h, z1.b
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    sunpklo z6.s, z0.h
-; CHECK-NEXT:    sunpklo z7.s, z1.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z4.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z5.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z20.d, z3.s
-; CHECK-NEXT:    sunpklo z22.d, z4.s
+; CHECK-NEXT:    sunpklo z16.d, z4.s
 ; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    sunpklo z16.d, z2.s
-; CHECK-NEXT:    sunpklo z17.d, z6.s
-; CHECK-NEXT:    sunpklo z18.d, z0.s
-; CHECK-NEXT:    sunpklo z19.d, z1.s
-; CHECK-NEXT:    sunpklo z21.d, z7.s
-; CHECK-NEXT:    sunpklo z23.d, z5.s
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z17.d, z5.s
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    sunpklo z4.d, z4.s
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    sunpklo z3.d, z3.s
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    sunpklo z6.s, z2.h
+; CHECK-NEXT:    sunpklo z7.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z4.d, z4.s
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z19.d, z0.s
 ; CHECK-NEXT:    sunpklo z5.d, z5.s
-; CHECK-NEXT:    stp q22, q4, [x1]
-; CHECK-NEXT:    sunpklo z4.d, z7.s
-; CHECK-NEXT:    stp q23, q5, [x1, #128]
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    stp q20, q3, [x1, #160]
-; CHECK-NEXT:    sunpklo z3.d, z6.s
-; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z18.d, z6.s
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    stp q16, q4, [x1, #128]
+; CHECK-NEXT:    mov z16.d, z7.d
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q16, q2, [x1, #32]
-; CHECK-NEXT:    stp q17, q3, [x1, #64]
-; CHECK-NEXT:    stp q18, q0, [x1, #96]
-; CHECK-NEXT:    stp q21, q4, [x1, #192]
-; CHECK-NEXT:    stp q19, q1, [x1, #224]
+; CHECK-NEXT:    stp q17, q5, [x1]
+; CHECK-NEXT:    sunpklo z5.d, z7.s
+; CHECK-NEXT:    sunpklo z4.d, z6.s
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    ext z16.b, z16.b, z7.b, #8
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    stp q19, q0, [x1, #160]
+; CHECK-NEXT:    sunpklo z0.d, z2.s
+; CHECK-NEXT:    ext z6.b, z6.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q18, q4, [x1, #192]
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    ext z7.b, z7.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z16.d, z16.s
+; CHECK-NEXT:    sunpklo z6.d, z6.s
+; CHECK-NEXT:    ext z4.b, z4.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z2.d, z7.s
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    stp q5, q16, [x1, #64]
+; CHECK-NEXT:    stp q1, q6, [x1, #32]
+; CHECK-NEXT:    sunpklo z1.d, z4.s
+; CHECK-NEXT:    stp q0, q2, [x1, #224]
+; CHECK-NEXT:    stp q3, q1, [x1, #96]
 ; CHECK-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -340,17 +348,17 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 define void @sext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-LABEL: sext_v16i16_v16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z1.h, z1.h, z1.h
 ; CHECK-NEXT:    sunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.h, z1.h, z1.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    stp q2, q0, [x1]
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -384,15 +392,15 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sunpklo z1.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z2.d, z1.s
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sunpklo z2.d, z1.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEXT:    sunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
-; CHECK-NEXT:    sunpklo z0.d, z1.s
-; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %b = sext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -402,31 +410,31 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 define void @sext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-LABEL: sext_v16i16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z1.h, z1.h, z1.h
 ; CHECK-NEXT:    sunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.h, z1.h, z1.h
-; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    sunpklo z4.d, z2.s
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z5.d, z3.s
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z6.d, z0.s
-; CHECK-NEXT:    stp q5, q3, [x1, #64]
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    sunpklo z7.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z3.d, z3.s
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q4, q2, [x1]
-; CHECK-NEXT:    stp q6, q0, [x1, #32]
-; CHECK-NEXT:    stp q5, q1, [x1, #96]
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q4, q2, [x1, #64]
+; CHECK-NEXT:    stp q5, q3, [x1]
+; CHECK-NEXT:    stp q6, q0, [x1, #96]
+; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -456,17 +464,17 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 define void @sext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-LABEL: sext_v8i32_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z1.s, z1.s, z1.s
 ; CHECK-NEXT:    sunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.s, z1.s, z1.s
-; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z3.d, z1.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    stp q2, q0, [x1]
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
@@ -497,17 +505,17 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) {
 define void @zext_v32i8_v32i16(ptr %in, ptr %out) {
 ; CHECK-LABEL: zext_v32i8_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z1.b, z1.b, z1.b
 ; CHECK-NEXT:    uunpklo z2.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z3.h, z1.b
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    stp q2, q0, [x1]
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -541,15 +549,15 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uunpklo z1.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
-; CHECK-NEXT:    uunpklo z0.s, z1.h
-; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i32>
   store <16 x i32> %b, ptr %out
@@ -559,31 +567,31 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) {
 define void @zext_v32i8_v32i32(ptr %in, ptr %out) {
 ; CHECK-LABEL: zext_v32i8_v32i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z1.b, z1.b, z1.b
 ; CHECK-NEXT:    uunpklo z2.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.b, z1.b, z1.b
-; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z3.h, z1.b
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z5.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z6.s, z0.h
-; CHECK-NEXT:    stp q5, q3, [x1, #64]
-; CHECK-NEXT:    uunpklo z5.s, z1.h
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z7.s, z1.h
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    stp q4, q2, [x1]
-; CHECK-NEXT:    stp q6, q0, [x1, #32]
-; CHECK-NEXT:    stp q5, q1, [x1, #96]
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    stp q4, q2, [x1, #64]
+; CHECK-NEXT:    stp q5, q3, [x1]
+; CHECK-NEXT:    stp q6, q0, [x1, #96]
+; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -643,29 +651,31 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uunpklo z1.h, z0.b
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z6.d, z0.s
+; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z4.d, z2.s
+; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    mov z7.d, z1.d
+; CHECK-NEXT:    uunpklo z2.d, z2.s
 ; CHECK-NEXT:    uunpklo z5.d, z3.s
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    uunpklo z7.d, z1.s
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    ext z7.b, z7.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    mov z6.d, z0.d
 ; CHECK-NEXT:    uunpklo z3.d, z3.s
-; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    stp q5, q3, [x0, #64]
 ; CHECK-NEXT:    stp q4, q2, [x0]
-; CHECK-NEXT:    stp q6, q0, [x0, #96]
-; CHECK-NEXT:    uunpklo z0.d, z1.s
-; CHECK-NEXT:    stp q7, q0, [x0, #32]
+; CHECK-NEXT:    uunpklo z4.d, z7.s
+; CHECK-NEXT:    ext z6.b, z6.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    stp q5, q3, [x0, #64]
+; CHECK-NEXT:    uunpklo z2.d, z6.s
+; CHECK-NEXT:    stp q1, q4, [x0, #32]
+; CHECK-NEXT:    stp q0, q2, [x0, #96]
 ; CHECK-NEXT:    ret
   %b = zext <16 x i8> %a to <16 x i64>
   store <16 x i64> %b, ptr %out
@@ -675,59 +685,65 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) {
 define void @zext_v32i8_v32i64(ptr %in, ptr %out) {
 ; CHECK-LABEL: zext_v32i8_v32i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z0.b, z0.b
-; CHECK-NEXT:    uunpklo z2.h, z0.b
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    add z1.b, z1.b, z1.b
+; CHECK-NEXT:    mov z2.d, z0.d
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z5.s, z3.h
+; CHECK-NEXT:    mov z3.d, z1.d
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    uunpklo z6.s, z0.h
-; CHECK-NEXT:    uunpklo z7.s, z1.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z4.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z5.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z20.d, z3.s
-; CHECK-NEXT:    uunpklo z22.d, z4.s
+; CHECK-NEXT:    uunpklo z16.d, z4.s
 ; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    uunpklo z16.d, z2.s
-; CHECK-NEXT:    uunpklo z17.d, z6.s
-; CHECK-NEXT:    uunpklo z18.d, z0.s
-; CHECK-NEXT:    uunpklo z19.d, z1.s
-; CHECK-NEXT:    uunpklo z21.d, z7.s
-; CHECK-NEXT:    uunpklo z23.d, z5.s
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z17.d, z5.s
 ; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    uunpklo z4.d, z4.s
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z3.s
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    uunpklo z6.s, z2.h
+; CHECK-NEXT:    uunpklo z7.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z4.d, z4.s
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z19.d, z0.s
 ; CHECK-NEXT:    uunpklo z5.d, z5.s
-; CHECK-NEXT:    stp q22, q4, [x1]
-; CHECK-NEXT:    uunpklo z4.d, z7.s
-; CHECK-NEXT:    stp q23, q5, [x1, #128]
-; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    stp q20, q3, [x1, #160]
-; CHECK-NEXT:    uunpklo z3.d, z6.s
-; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z18.d, z6.s
+; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    stp q16, q4, [x1, #128]
+; CHECK-NEXT:    mov z16.d, z7.d
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q16, q2, [x1, #32]
-; CHECK-NEXT:    stp q17, q3, [x1, #64]
-; CHECK-NEXT:    stp q18, q0, [x1, #96]
-; CHECK-NEXT:    stp q21, q4, [x1, #192]
-; CHECK-NEXT:    stp q19, q1, [x1, #224]
+; CHECK-NEXT:    stp q17, q5, [x1]
+; CHECK-NEXT:    uunpklo z5.d, z7.s
+; CHECK-NEXT:    uunpklo z4.d, z6.s
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    ext z16.b, z16.b, z7.b, #8
+; CHECK-NEXT:    mov z7.d, z2.d
+; CHECK-NEXT:    stp q19, q0, [x1, #160]
+; CHECK-NEXT:    uunpklo z0.d, z2.s
+; CHECK-NEXT:    ext z6.b, z6.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q18, q4, [x1, #192]
+; CHECK-NEXT:    mov z4.d, z3.d
+; CHECK-NEXT:    ext z7.b, z7.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z16.d, z16.s
+; CHECK-NEXT:    uunpklo z6.d, z6.s
+; CHECK-NEXT:    ext z4.b, z4.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z2.d, z7.s
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    stp q5, q16, [x1, #64]
+; CHECK-NEXT:    stp q1, q6, [x1, #32]
+; CHECK-NEXT:    uunpklo z1.d, z4.s
+; CHECK-NEXT:    stp q0, q2, [x1, #224]
+; CHECK-NEXT:    stp q3, q1, [x1, #96]
 ; CHECK-NEXT:    ret
   %a = load <32 x i8>, ptr %in
   %b = add <32 x i8> %a, %a
@@ -757,17 +773,17 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) {
 define void @zext_v16i16_v16i32(ptr %in, ptr %out) {
 ; CHECK-LABEL: zext_v16i16_v16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z1.h, z1.h, z1.h
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.h, z1.h, z1.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    stp q2, q0, [x1]
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -801,15 +817,15 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uunpklo z1.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z2.d, z1.s
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    uunpklo z2.d, z1.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    stp q2, q1, [x0]
 ; CHECK-NEXT:    stp q3, q0, [x0, #32]
-; CHECK-NEXT:    uunpklo z0.d, z1.s
-; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %b = zext <8 x i16> %a to <8 x i64>
   store <8 x i64>%b, ptr %out
@@ -819,31 +835,31 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) {
 define void @zext_v16i16_v16i64(ptr %in, ptr %out) {
 ; CHECK-LABEL: zext_v16i16_v16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z1.h, z1.h, z1.h
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.h, z1.h, z1.h
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z4.d, z2.s
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z5.d, z3.s
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z6.d, z0.s
-; CHECK-NEXT:    stp q5, q3, [x1, #64]
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    uunpklo z7.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    stp q4, q2, [x1]
-; CHECK-NEXT:    stp q6, q0, [x1, #32]
-; CHECK-NEXT:    stp q5, q1, [x1, #96]
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q4, q2, [x1, #64]
+; CHECK-NEXT:    stp q5, q3, [x1]
+; CHECK-NEXT:    stp q6, q0, [x1, #96]
+; CHECK-NEXT:    stp q7, q1, [x1, #32]
 ; CHECK-NEXT:    ret
   %a = load <16 x i16>, ptr %in
   %b = add <16 x i16> %a, %a
@@ -873,17 +889,17 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) {
 define void @zext_v8i32_v8i64(ptr %in, ptr %out) {
 ; CHECK-LABEL: zext_v8i32_v8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z1.s, z1.s, z1.s
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    add z1.s, z1.s, z1.s
-; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z3.d, z1.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    stp q2, q0, [x1]
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <8 x i32>, ptr %in
   %b = add <8 x i32> %a, %a
@@ -896,8 +912,8 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) {
 ; SVE-LABEL: extend_and_mul:
 ; SVE:       // %bb.0:
 ; SVE-NEXT:    mov z1.s, w0
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.d, vl2
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    uunpklo z1.d, z1.s
 ; SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; SVE-NEXT:    str q0, [x1]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
index df3b622c66f0c3..a83faee694e646 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
@@ -220,11 +220,11 @@ define void @ashr_v4i64(ptr %a) {
 define void @icmp_eq_v32i8(ptr %a) {
 ; CHECK-LABEL: icmp_eq_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, #7
-; CHECK-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    cmpeq p0.b, p0/z, z1.b, #7
+; CHECK-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -240,11 +240,11 @@ define void @icmp_eq_v32i8(ptr %a) {
 define void @icmp_sge_v16i16(ptr %a) {
 ; CHECK-LABEL: icmp_sge_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    cmpge p1.h, p0/z, z0.h, #15
-; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    cmpge p0.h, p0/z, z1.h, #15
+; CHECK-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.h, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -260,11 +260,11 @@ define void @icmp_sge_v16i16(ptr %a) {
 define void @icmp_sgt_v8i32(ptr %a) {
 ; CHECK-LABEL: icmp_sgt_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    cmpgt p1.s, p0/z, z0.s, #-8
-; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    cmpgt p0.s, p0/z, z1.s, #-8
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -280,11 +280,11 @@ define void @icmp_sgt_v8i32(ptr %a) {
 define void @icmp_ult_v4i64(ptr %a) {
 ; CHECK-LABEL: icmp_ult_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    cmplo p1.d, p0/z, z0.d, #63
-; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    cmplo p0.d, p0/z, z1.d, #63
+; CHECK-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
index 052d5245b9522a..eac0bdc66ba45e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
@@ -34,10 +34,10 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @and_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: and_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    and z0.d, z0.d, z2.d
-; CHECK-NEXT:    and z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    and z0.d, z1.d, z0.d
+; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -74,10 +74,10 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @and_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: and_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    and z0.d, z0.d, z2.d
-; CHECK-NEXT:    and z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    and z0.d, z1.d, z0.d
+; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -114,10 +114,10 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @and_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: and_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    and z0.d, z0.d, z2.d
-; CHECK-NEXT:    and z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    and z0.d, z1.d, z0.d
+; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -154,10 +154,10 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @and_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: and_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    and z0.d, z0.d, z2.d
-; CHECK-NEXT:    and z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    and z0.d, z1.d, z0.d
+; CHECK-NEXT:    and z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
@@ -198,10 +198,10 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @or_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: or_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    orr z0.d, z0.d, z2.d
-; CHECK-NEXT:    orr z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    orr z0.d, z1.d, z0.d
+; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -238,10 +238,10 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @or_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: or_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    orr z0.d, z0.d, z2.d
-; CHECK-NEXT:    orr z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    orr z0.d, z1.d, z0.d
+; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -278,10 +278,10 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @or_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: or_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    orr z0.d, z0.d, z2.d
-; CHECK-NEXT:    orr z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    orr z0.d, z1.d, z0.d
+; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -318,10 +318,10 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @or_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: or_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    orr z0.d, z0.d, z2.d
-; CHECK-NEXT:    orr z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    orr z0.d, z1.d, z0.d
+; CHECK-NEXT:    orr z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
@@ -362,10 +362,10 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @xor_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: xor_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -402,10 +402,10 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @xor_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: xor_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -442,10 +442,10 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @xor_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: xor_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -482,10 +482,10 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @xor_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: xor_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    eor z1.d, z1.d, z3.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    eor z0.d, z1.d, z0.d
+; CHECK-NEXT:    eor z1.d, z2.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index 02c60fbf99bb37..de44c44a62bf7a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-LABEL: smax_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -23,8 +23,8 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-LABEL: smax_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -36,10 +36,11 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @smax_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: smax_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    smax z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -53,8 +54,8 @@ define void @smax_v32i8(ptr %a, ptr %b) {
 define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: smax_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -66,8 +67,8 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-LABEL: smax_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -79,10 +80,11 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @smax_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: smax_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    smax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -96,8 +98,8 @@ define void @smax_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: smax_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -109,8 +111,8 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: smax_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -122,10 +124,11 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @smax_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: smax_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    smax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -140,8 +143,8 @@ define void @smax_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: smax_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -154,8 +157,8 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: smax_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -167,10 +170,11 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @smax_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: smax_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    smax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -188,8 +192,8 @@ define void @smax_v4i64(ptr %a, ptr %b) {
 define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-LABEL: smin_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -201,8 +205,8 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-LABEL: smin_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -214,10 +218,11 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @smin_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: smin_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    smin z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -231,8 +236,8 @@ define void @smin_v32i8(ptr %a, ptr %b) {
 define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: smin_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -244,8 +249,8 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-LABEL: smin_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -257,10 +262,11 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @smin_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: smin_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    smin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -274,8 +280,8 @@ define void @smin_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: smin_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -287,8 +293,8 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: smin_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -300,10 +306,11 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @smin_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: smin_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    smin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -318,8 +325,8 @@ define void @smin_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: smin_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -332,8 +339,8 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: smin_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -345,10 +352,11 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @smin_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: smin_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    smin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -366,8 +374,8 @@ define void @smin_v4i64(ptr %a, ptr %b) {
 define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-LABEL: umax_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -379,8 +387,8 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-LABEL: umax_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -392,10 +400,11 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @umax_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: umax_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    umax z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -409,8 +418,8 @@ define void @umax_v32i8(ptr %a, ptr %b) {
 define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: umax_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -422,8 +431,8 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-LABEL: umax_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -435,10 +444,11 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @umax_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: umax_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    umax z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -452,8 +462,8 @@ define void @umax_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: umax_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -465,8 +475,8 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: umax_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -478,10 +488,11 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @umax_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: umax_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    umax z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -496,8 +507,8 @@ define void @umax_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: umax_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -510,8 +521,8 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: umax_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -523,10 +534,11 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @umax_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: umax_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    umax z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -544,8 +556,8 @@ define void @umax_v4i64(ptr %a, ptr %b) {
 define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-LABEL: umin_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -557,8 +569,8 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-LABEL: umin_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -570,10 +582,11 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @umin_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: umin_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    umin z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -587,8 +600,8 @@ define void @umin_v32i8(ptr %a, ptr %b) {
 define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: umin_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -600,8 +613,8 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-LABEL: umin_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -613,10 +626,11 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @umin_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: umin_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    umin z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -630,8 +644,8 @@ define void @umin_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: umin_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -643,8 +657,8 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: umin_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -656,10 +670,11 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @umin_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: umin_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    umin z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -674,8 +689,8 @@ define void @umin_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: umin_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -688,8 +703,8 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: umin_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -701,10 +716,11 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @umin_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: umin_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    umin z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index b6a9acb4d550ea..6f7a5077ee6a1b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -14,9 +14,9 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE-LABEL: smulh_v4i8:
 ; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.h, vl4
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT:    ptrue p0.h, vl4
 ; SVE-NEXT:    sxtb z0.h, p0/m, z0.h
 ; SVE-NEXT:    sxtb z1.h, p0/m, z1.h
 ; SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
@@ -26,9 +26,9 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ;
 ; SVE2-LABEL: smulh_v4i8:
 ; SVE2:       // %bb.0:
+; SVE2-NEXT:    ptrue p0.h, vl4
 ; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SVE2-NEXT:    ptrue p0.h, vl4
 ; SVE2-NEXT:    sxtb z0.h, p0/m, z0.h
 ; SVE2-NEXT:    sxtb z1.h, p0/m, z1.h
 ; SVE2-NEXT:    mul z0.h, z0.h, z1.h
@@ -48,8 +48,8 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE-LABEL: smulh_v8i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.b, vl8
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -75,8 +75,8 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE-LABEL: smulh_v16i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.b, vl16
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -100,20 +100,21 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @smulh_v32i8(ptr %a, ptr %b) {
 ; SVE-LABEL: smulh_v32i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.b, vl16
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    smulh z0.b, p0/m, z0.b, z2.b
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    smulh z1.b, p0/m, z1.b, z3.b
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: smulh_v32i8:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    smulh z0.b, z0.b, z2.b
-; SVE2-NEXT:    smulh z1.b, z1.b, z3.b
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    smulh z0.b, z1.b, z0.b
+; SVE2-NEXT:    smulh z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -130,9 +131,9 @@ define void @smulh_v32i8(ptr %a, ptr %b) {
 define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE-LABEL: smulh_v2i16:
 ; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.s, vl2
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT:    ptrue p0.s, vl2
 ; SVE-NEXT:    sxth z0.s, p0/m, z0.s
 ; SVE-NEXT:    sxth z1.s, p0/m, z1.s
 ; SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
@@ -142,9 +143,9 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ;
 ; SVE2-LABEL: smulh_v2i16:
 ; SVE2:       // %bb.0:
+; SVE2-NEXT:    ptrue p0.s, vl2
 ; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SVE2-NEXT:    ptrue p0.s, vl2
 ; SVE2-NEXT:    sxth z0.s, p0/m, z0.s
 ; SVE2-NEXT:    sxth z1.s, p0/m, z1.s
 ; SVE2-NEXT:    mul z0.s, z0.s, z1.s
@@ -162,8 +163,8 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE-LABEL: smulh_v4i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.h, vl4
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -187,8 +188,8 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE-LABEL: smulh_v8i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.h, vl8
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -212,20 +213,21 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @smulh_v16i16(ptr %a, ptr %b) {
 ; SVE-LABEL: smulh_v16i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.h, vl8
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    smulh z0.h, p0/m, z0.h, z2.h
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    smulh z1.h, p0/m, z1.h, z3.h
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: smulh_v16i16:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    smulh z0.h, z0.h, z2.h
-; SVE2-NEXT:    smulh z1.h, z1.h, z3.h
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    smulh z0.h, z1.h, z0.h
+; SVE2-NEXT:    smulh z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -242,8 +244,8 @@ define void @smulh_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE-LABEL: smulh_v2i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.s, vl2
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -267,8 +269,8 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE-LABEL: smulh_v4i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.s, vl4
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -292,20 +294,21 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @smulh_v8i32(ptr %a, ptr %b) {
 ; SVE-LABEL: smulh_v8i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.s, vl4
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    smulh z0.s, p0/m, z0.s, z2.s
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    smulh z1.s, p0/m, z1.s, z3.s
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: smulh_v8i32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    smulh z0.s, z0.s, z2.s
-; SVE2-NEXT:    smulh z1.s, z1.s, z3.s
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    smulh z0.s, z1.s, z0.s
+; SVE2-NEXT:    smulh z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -322,8 +325,8 @@ define void @smulh_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE-LABEL: smulh_v1i64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.d, vl1
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -349,8 +352,8 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE-LABEL: smulh_v2i64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.d, vl2
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -374,20 +377,21 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @smulh_v4i64(ptr %a, ptr %b) {
 ; SVE-LABEL: smulh_v4i64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.d, vl2
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    smulh z0.d, p0/m, z0.d, z2.d
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    smulh z1.d, p0/m, z1.d, z3.d
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: smulh_v4i64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    smulh z0.d, z0.d, z2.d
-; SVE2-NEXT:    smulh z1.d, z1.d, z3.d
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    smulh z0.d, z1.d, z0.d
+; SVE2-NEXT:    smulh z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
@@ -408,9 +412,9 @@ define void @smulh_v4i64(ptr %a, ptr %b) {
 define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; SVE-LABEL: umulh_v4i8:
 ; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.h, vl4
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT:    ptrue p0.h, vl4
 ; SVE-NEXT:    and z0.h, z0.h, #0xff
 ; SVE-NEXT:    and z1.h, z1.h, #0xff
 ; SVE-NEXT:    mul z0.h, p0/m, z0.h, z1.h
@@ -439,8 +443,8 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; SVE-LABEL: umulh_v8i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.b, vl8
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -464,8 +468,8 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; SVE-LABEL: umulh_v16i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.b, vl16
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -489,20 +493,21 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @umulh_v32i8(ptr %a, ptr %b) {
 ; SVE-LABEL: umulh_v32i8:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.b, vl16
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    umulh z0.b, p0/m, z0.b, z2.b
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    umulh z1.b, p0/m, z1.b, z3.b
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: umulh_v32i8:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    umulh z0.b, z0.b, z2.b
-; SVE2-NEXT:    umulh z1.b, z1.b, z3.b
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    umulh z0.b, z1.b, z0.b
+; SVE2-NEXT:    umulh z1.b, z2.b, z3.b
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -519,9 +524,9 @@ define void @umulh_v32i8(ptr %a, ptr %b) {
 define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; SVE-LABEL: umulh_v2i16:
 ; SVE:       // %bb.0:
+; SVE-NEXT:    ptrue p0.s, vl2
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT:    ptrue p0.s, vl2
 ; SVE-NEXT:    and z0.s, z0.s, #0xffff
 ; SVE-NEXT:    and z1.s, z1.s, #0xffff
 ; SVE-NEXT:    mul z0.s, p0/m, z0.s, z1.s
@@ -550,8 +555,8 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; SVE-LABEL: umulh_v4i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.h, vl4
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -575,8 +580,8 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; SVE-LABEL: umulh_v8i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.h, vl8
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -600,20 +605,21 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @umulh_v16i16(ptr %a, ptr %b) {
 ; SVE-LABEL: umulh_v16i16:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.h, vl8
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    umulh z0.h, p0/m, z0.h, z2.h
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    umulh z1.h, p0/m, z1.h, z3.h
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: umulh_v16i16:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    umulh z0.h, z0.h, z2.h
-; SVE2-NEXT:    umulh z1.h, z1.h, z3.h
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    umulh z0.h, z1.h, z0.h
+; SVE2-NEXT:    umulh z1.h, z2.h, z3.h
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -630,8 +636,8 @@ define void @umulh_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; SVE-LABEL: umulh_v2i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.s, vl2
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -655,8 +661,8 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; SVE-LABEL: umulh_v4i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.s, vl4
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -680,20 +686,21 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @umulh_v8i32(ptr %a, ptr %b) {
 ; SVE-LABEL: umulh_v8i32:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.s, vl4
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    umulh z0.s, p0/m, z0.s, z2.s
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    umulh z1.s, p0/m, z1.s, z3.s
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: umulh_v8i32:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    umulh z0.s, z0.s, z2.s
-; SVE2-NEXT:    umulh z1.s, z1.s, z3.s
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    umulh z0.s, z1.s, z0.s
+; SVE2-NEXT:    umulh z1.s, z2.s, z3.s
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -712,8 +719,8 @@ define void @umulh_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; SVE-LABEL: umulh_v1i64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    ptrue p0.d, vl1
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; SVE-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; SVE-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; SVE-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -737,8 +744,8 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; SVE-LABEL: umulh_v2i64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    ptrue p0.d, vl2
+; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; SVE-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
 ; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -762,20 +769,21 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @umulh_v4i64(ptr %a, ptr %b) {
 ; SVE-LABEL: umulh_v4i64:
 ; SVE:       // %bb.0:
-; SVE-NEXT:    ldp q0, q1, [x0]
 ; SVE-NEXT:    ptrue p0.d, vl2
-; SVE-NEXT:    ldp q2, q3, [x1]
-; SVE-NEXT:    umulh z0.d, p0/m, z0.d, z2.d
+; SVE-NEXT:    ldp q0, q3, [x1]
+; SVE-NEXT:    ldp q1, q2, [x0]
+; SVE-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
+; SVE-NEXT:    movprfx z1, z2
 ; SVE-NEXT:    umulh z1.d, p0/m, z1.d, z3.d
 ; SVE-NEXT:    stp q0, q1, [x0]
 ; SVE-NEXT:    ret
 ;
 ; SVE2-LABEL: umulh_v4i64:
 ; SVE2:       // %bb.0:
-; SVE2-NEXT:    ldp q0, q1, [x0]
-; SVE2-NEXT:    ldp q2, q3, [x1]
-; SVE2-NEXT:    umulh z0.d, z0.d, z2.d
-; SVE2-NEXT:    umulh z1.d, z1.d, z3.d
+; SVE2-NEXT:    ldp q0, q3, [x1]
+; SVE2-NEXT:    ldp q1, q2, [x0]
+; SVE2-NEXT:    umulh z0.d, z1.d, z0.d
+; SVE2-NEXT:    umulh z1.d, z2.d, z3.d
 ; SVE2-NEXT:    stp q0, q1, [x0]
 ; SVE2-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index b8570865846ad4..44bc77615ef27e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define i8 @uaddv_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: uaddv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -23,8 +23,8 @@ define i8 @uaddv_v8i8(<8 x i8> %a) {
 define i8 @uaddv_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: uaddv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -36,8 +36,8 @@ define i8 @uaddv_v16i8(<16 x i8> %a) {
 define i8 @uaddv_v32i8(ptr %a) {
 ; CHECK-LABEL: uaddv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.b, z1.b, z0.b
 ; CHECK-NEXT:    uaddv d0, p0, z0.b
 ; CHECK-NEXT:    fmov x0, d0
@@ -51,8 +51,8 @@ define i8 @uaddv_v32i8(ptr %a) {
 define i16 @uaddv_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: uaddv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -64,8 +64,8 @@ define i16 @uaddv_v4i16(<4 x i16> %a) {
 define i16 @uaddv_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: uaddv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -77,8 +77,8 @@ define i16 @uaddv_v8i16(<8 x i16> %a) {
 define i16 @uaddv_v16i16(ptr %a) {
 ; CHECK-LABEL: uaddv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    uaddv d0, p0, z0.h
 ; CHECK-NEXT:    fmov x0, d0
@@ -92,8 +92,8 @@ define i16 @uaddv_v16i16(ptr %a) {
 define i32 @uaddv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: uaddv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -105,8 +105,8 @@ define i32 @uaddv_v2i32(<2 x i32> %a) {
 define i32 @uaddv_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: uaddv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
@@ -118,8 +118,8 @@ define i32 @uaddv_v4i32(<4 x i32> %a) {
 define i32 @uaddv_v8i32(ptr %a) {
 ; CHECK-LABEL: uaddv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NEXT:    uaddv d0, p0, z0.s
 ; CHECK-NEXT:    fmov x0, d0
@@ -133,8 +133,8 @@ define i32 @uaddv_v8i32(ptr %a) {
 define i64 @uaddv_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: uaddv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -145,8 +145,8 @@ define i64 @uaddv_v2i64(<2 x i64> %a) {
 define i64 @uaddv_v4i64(ptr %a) {
 ; CHECK-LABEL: uaddv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
@@ -163,8 +163,8 @@ define i64 @uaddv_v4i64(ptr %a) {
 define i8 @smaxv_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: smaxv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -175,8 +175,8 @@ define i8 @smaxv_v8i8(<8 x i8> %a) {
 define i8 @smaxv_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: smaxv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -187,8 +187,8 @@ define i8 @smaxv_v16i8(<16 x i8> %a) {
 define i8 @smaxv_v32i8(ptr %a) {
 ; CHECK-LABEL: smaxv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    smax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    smaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
@@ -201,8 +201,8 @@ define i8 @smaxv_v32i8(ptr %a) {
 define i16 @smaxv_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: smaxv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -213,8 +213,8 @@ define i16 @smaxv_v4i16(<4 x i16> %a) {
 define i16 @smaxv_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: smaxv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -225,8 +225,8 @@ define i16 @smaxv_v8i16(<8 x i16> %a) {
 define i16 @smaxv_v16i16(ptr %a) {
 ; CHECK-LABEL: smaxv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    smaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
@@ -239,8 +239,8 @@ define i16 @smaxv_v16i16(ptr %a) {
 define i32 @smaxv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: smaxv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -251,8 +251,8 @@ define i32 @smaxv_v2i32(<2 x i32> %a) {
 define i32 @smaxv_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: smaxv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -263,8 +263,8 @@ define i32 @smaxv_v4i32(<4 x i32> %a) {
 define i32 @smaxv_v8i32(ptr %a) {
 ; CHECK-LABEL: smaxv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    smaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
@@ -278,8 +278,8 @@ define i32 @smaxv_v8i32(ptr %a) {
 define i64 @smaxv_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: smaxv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    smaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -290,8 +290,8 @@ define i64 @smaxv_v2i64(<2 x i64> %a) {
 define i64 @smaxv_v4i64(ptr %a) {
 ; CHECK-LABEL: smaxv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    smaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
@@ -308,8 +308,8 @@ define i64 @smaxv_v4i64(ptr %a) {
 define i8 @sminv_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: sminv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -320,8 +320,8 @@ define i8 @sminv_v8i8(<8 x i8> %a) {
 define i8 @sminv_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: sminv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -332,8 +332,8 @@ define i8 @sminv_v16i8(<16 x i8> %a) {
 define i8 @sminv_v32i8(ptr %a) {
 ; CHECK-LABEL: sminv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    smin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    sminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
@@ -346,8 +346,8 @@ define i8 @sminv_v32i8(ptr %a) {
 define i16 @sminv_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: sminv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -358,8 +358,8 @@ define i16 @sminv_v4i16(<4 x i16> %a) {
 define i16 @sminv_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: sminv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -370,8 +370,8 @@ define i16 @sminv_v8i16(<8 x i16> %a) {
 define i16 @sminv_v16i16(ptr %a) {
 ; CHECK-LABEL: sminv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    sminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
@@ -384,8 +384,8 @@ define i16 @sminv_v16i16(ptr %a) {
 define i32 @sminv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: sminv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -396,8 +396,8 @@ define i32 @sminv_v2i32(<2 x i32> %a) {
 define i32 @sminv_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: sminv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -408,8 +408,8 @@ define i32 @sminv_v4i32(<4 x i32> %a) {
 define i32 @sminv_v8i32(ptr %a) {
 ; CHECK-LABEL: sminv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    sminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
@@ -423,8 +423,8 @@ define i32 @sminv_v8i32(ptr %a) {
 define i64 @sminv_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: sminv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    sminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -435,8 +435,8 @@ define i64 @sminv_v2i64(<2 x i64> %a) {
 define i64 @sminv_v4i64(ptr %a) {
 ; CHECK-LABEL: sminv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    sminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
@@ -453,8 +453,8 @@ define i64 @sminv_v4i64(ptr %a) {
 define i8 @umaxv_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: umaxv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -465,8 +465,8 @@ define i8 @umaxv_v8i8(<8 x i8> %a) {
 define i8 @umaxv_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: umaxv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -477,8 +477,8 @@ define i8 @umaxv_v16i8(<16 x i8> %a) {
 define i8 @umaxv_v32i8(ptr %a) {
 ; CHECK-LABEL: umaxv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    umax z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
@@ -491,8 +491,8 @@ define i8 @umaxv_v32i8(ptr %a) {
 define i16 @umaxv_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: umaxv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -503,8 +503,8 @@ define i16 @umaxv_v4i16(<4 x i16> %a) {
 define i16 @umaxv_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: umaxv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -515,8 +515,8 @@ define i16 @umaxv_v8i16(<8 x i16> %a) {
 define i16 @umaxv_v16i16(ptr %a) {
 ; CHECK-LABEL: umaxv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    umaxv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
@@ -529,8 +529,8 @@ define i16 @umaxv_v16i16(ptr %a) {
 define i32 @umaxv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: umaxv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -541,8 +541,8 @@ define i32 @umaxv_v2i32(<2 x i32> %a) {
 define i32 @umaxv_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: umaxv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -553,8 +553,8 @@ define i32 @umaxv_v4i32(<4 x i32> %a) {
 define i32 @umaxv_v8i32(ptr %a) {
 ; CHECK-LABEL: umaxv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    umaxv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
@@ -568,8 +568,8 @@ define i32 @umaxv_v8i32(ptr %a) {
 define i64 @umaxv_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: umaxv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    umaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -580,8 +580,8 @@ define i64 @umaxv_v2i64(<2 x i64> %a) {
 define i64 @umaxv_v4i64(ptr %a) {
 ; CHECK-LABEL: umaxv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    umaxv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
@@ -598,8 +598,8 @@ define i64 @umaxv_v4i64(ptr %a) {
 define i8 @uminv_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: uminv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -610,8 +610,8 @@ define i8 @uminv_v8i8(<8 x i8> %a) {
 define i8 @uminv_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: uminv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -622,8 +622,8 @@ define i8 @uminv_v16i8(<16 x i8> %a) {
 define i8 @uminv_v32i8(ptr %a) {
 ; CHECK-LABEL: uminv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    umin z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
@@ -636,8 +636,8 @@ define i8 @uminv_v32i8(ptr %a) {
 define i16 @uminv_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: uminv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -648,8 +648,8 @@ define i16 @uminv_v4i16(<4 x i16> %a) {
 define i16 @uminv_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: uminv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -660,8 +660,8 @@ define i16 @uminv_v8i16(<8 x i16> %a) {
 define i16 @uminv_v16i16(ptr %a) {
 ; CHECK-LABEL: uminv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    uminv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
@@ -674,8 +674,8 @@ define i16 @uminv_v16i16(ptr %a) {
 define i32 @uminv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: uminv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -686,8 +686,8 @@ define i32 @uminv_v2i32(<2 x i32> %a) {
 define i32 @uminv_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: uminv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -698,8 +698,8 @@ define i32 @uminv_v4i32(<4 x i32> %a) {
 define i32 @uminv_v8i32(ptr %a) {
 ; CHECK-LABEL: uminv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    uminv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
@@ -713,8 +713,8 @@ define i32 @uminv_v8i32(ptr %a) {
 define i64 @uminv_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: uminv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -725,8 +725,8 @@ define i64 @uminv_v2i64(<2 x i64> %a) {
 define i64 @uminv_v4i64(ptr %a) {
 ; CHECK-LABEL: uminv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    uminv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index 436e360d3cf73d..6e855a5e7b3ca7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -10,12 +10,12 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-LABEL: srem_v4i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    ptrue p1.s, vl4
-; CHECK-NEXT:    sxtb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    sxtb z0.h, p0/m, z0.h
+; CHECK-NEXT:    sxtb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p1/m, z2.s, z3.s
@@ -34,21 +34,21 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEXT:    sunpklo z3.h, z0.b
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    splice z4.h, p0, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -63,43 +63,42 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ptrue p1.b, vl16
 ; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z2.h, z2.b
 ; CHECK-NEXT:    sunpklo z3.h, z3.b
-; CHECK-NEXT:    sunpklo z5.s, z2.h
-; CHECK-NEXT:    sunpklo z6.s, z3.h
+; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    sunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    sunpklo z5.h, z0.b
+; CHECK-NEXT:    sunpklo z7.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    sunpklo z5.s, z5.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    sunpklo z4.h, z1.b
-; CHECK-NEXT:    sunpklo z6.h, z0.b
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    sunpklo z3.s, z4.h
-; CHECK-NEXT:    splice z5.h, p1, z5.h, z2.h
-; CHECK-NEXT:    sunpklo z2.s, z6.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpklo z6.s, z6.h
-; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    movprfx z3, z6
-; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    sunpklo z3.h, z1.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    sunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    uzp1 z4.b, z5.b, z5.b
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT:    splice z5.h, p0, z5.h, z3.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    splice z2.b, p0, z2.b, z4.b
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
+; CHECK-NEXT:    splice z3.b, p0, z3.b, z2.b
+; CHECK-NEXT:    mls z0.b, p1/m, z3.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = srem <16 x i8> %op1, %op2
@@ -109,80 +108,83 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @srem_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: srem_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q0, [x0]
+; CHECK-NEXT:    ldr q0, [x0, #16]
+; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    ldp q3, q1, [x1]
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    sunpklo z7.h, z0.b
-; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z5.h, z5.b
-; CHECK-NEXT:    sunpklo z18.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    sunpklo z5.s, z5.h
-; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    ptrue p1.b, vl16
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    sunpklo z6.h, z1.b
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z16.s, z6.h
-; CHECK-NEXT:    sunpklo z4.h, z4.b
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    sunpklo z17.s, z4.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
-; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    sunpklo z18.s, z7.h
-; CHECK-NEXT:    uzp1 z17.h, z17.h, z17.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    sunpklo z5.s, z6.h
-; CHECK-NEXT:    splice z17.h, p1, z17.h, z4.h
-; CHECK-NEXT:    sunpklo z4.s, z7.h
-; CHECK-NEXT:    mov z6.d, z3.d
-; CHECK-NEXT:    mov z7.d, z2.d
-; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z2.b, #8
-; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z18.s
-; CHECK-NEXT:    sunpklo z6.h, z6.b
-; CHECK-NEXT:    sunpklo z7.h, z7.b
-; CHECK-NEXT:    sdiv z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uzp1 z5.h, z16.h, z16.h
+; CHECK-NEXT:    sunpklo z7.h, z0.b
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z16.s, z6.h
-; CHECK-NEXT:    sunpklo z18.s, z7.h
 ; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    sunpklo z17.s, z7.h
 ; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    sunpklo z4.h, z2.b
+; CHECK-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEXT:    sunpklo z6.s, z6.h
 ; CHECK-NEXT:    sunpklo z7.s, z7.h
-; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z18.s
-; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    splice z5.h, p1, z5.h, z4.h
-; CHECK-NEXT:    splice z7.h, p1, z7.h, z6.h
-; CHECK-NEXT:    sunpklo z4.h, z3.b
-; CHECK-NEXT:    sunpklo z6.h, z2.b
-; CHECK-NEXT:    sunpklo z16.s, z4.h
-; CHECK-NEXT:    sunpklo z18.s, z6.h
+; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z17.s
+; CHECK-NEXT:    sunpklo z2.s, z4.h
+; CHECK-NEXT:    sunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpklo z6.s, z6.h
-; CHECK-NEXT:    sdivr z16.s, p0/m, z16.s, z18.s
-; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z5.s
+; CHECK-NEXT:    ldr q5, [x1]
+; CHECK-NEXT:    mov z17.d, z5.d
 ; CHECK-NEXT:    uzp1 z16.h, z16.h, z16.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    splice z16.h, p1, z16.h, z4.h
-; CHECK-NEXT:    uzp1 z6.b, z17.b, z17.b
-; CHECK-NEXT:    uzp1 z5.b, z5.b, z5.b
-; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z4.b, z7.b, z7.b
+; CHECK-NEXT:    ext z17.b, z17.b, z5.b, #8
+; CHECK-NEXT:    sunpklo z17.h, z17.b
+; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    ldr q4, [x0]
+; CHECK-NEXT:    sunpklo z19.s, z17.h
+; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
+; CHECK-NEXT:    mov z18.d, z4.d
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    sunpklo z17.s, z17.h
+; CHECK-NEXT:    ext z18.b, z18.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z18.h, z18.b
+; CHECK-NEXT:    sunpklo z20.s, z18.h
+; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    sunpklo z18.s, z18.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    sdivr z19.s, p0/m, z19.s, z20.s
+; CHECK-NEXT:    sunpklo z20.h, z4.b
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    sunpklo z22.s, z20.h
+; CHECK-NEXT:    ext z20.b, z20.b, z20.b, #8
+; CHECK-NEXT:    sunpklo z20.s, z20.h
+; CHECK-NEXT:    sdivr z17.s, p0/m, z17.s, z18.s
+; CHECK-NEXT:    sunpklo z18.h, z5.b
+; CHECK-NEXT:    uzp1 z7.h, z19.h, z19.h
+; CHECK-NEXT:    sunpklo z21.s, z18.h
+; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    sunpklo z18.s, z18.h
+; CHECK-NEXT:    sdivr z21.s, p0/m, z21.s, z22.s
+; CHECK-NEXT:    uzp1 z17.h, z17.h, z17.h
+; CHECK-NEXT:    sdivr z18.s, p0/m, z18.s, z20.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z19.h, z21.h, z21.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z17.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    splice z16.h, p0, z16.h, z6.h
+; CHECK-NEXT:    uzp1 z3.b, z7.b, z7.b
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
 ; CHECK-NEXT:    uzp1 z7.b, z16.b, z16.b
-; CHECK-NEXT:    ptrue p1.b, vl16
-; CHECK-NEXT:    splice z7.b, p0, z7.b, z4.b
-; CHECK-NEXT:    splice z5.b, p0, z5.b, z6.b
-; CHECK-NEXT:    mls z2.b, p1/m, z7.b, z3.b
-; CHECK-NEXT:    mls z0.b, p1/m, z5.b, z1.b
+; CHECK-NEXT:    uzp1 z18.h, z18.h, z18.h
+; CHECK-NEXT:    splice z19.h, p0, z19.h, z18.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z6.b, z19.b, z19.b
+; CHECK-NEXT:    splice z7.b, p0, z7.b, z2.b
+; CHECK-NEXT:    splice z6.b, p0, z6.b, z3.b
+; CHECK-NEXT:    movprfx z2, z4
+; CHECK-NEXT:    mls z2.b, p1/m, z6.b, z5.b
+; CHECK-NEXT:    mls z0.b, p1/m, z7.b, z1.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -195,9 +197,9 @@ define void @srem_v32i8(ptr %a, ptr %b) {
 define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: srem_v4i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEXT:    sunpklo z3.s, z0.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
@@ -217,21 +219,21 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    sunpklo z4.s, z0.h
+; CHECK-NEXT:    ptrue p1.h, vl8
 ; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z3.s, z3.h
-; CHECK-NEXT:    sunpklo z4.s, z1.h
 ; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    sunpklo z3.s, z0.h
-; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    sunpklo z3.s, z1.h
+; CHECK-NEXT:    sdivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
-; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
+; CHECK-NEXT:    mls z0.h, p1/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = srem <8 x i16> %op1, %op2
@@ -241,40 +243,41 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @srem_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: srem_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q0, [x0]
+; CHECK-NEXT:    ldp q4, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ptrue p1.h, vl8
-; CHECK-NEXT:    mov z17.d, z2.d
-; CHECK-NEXT:    ext z17.b, z17.b, z2.b, #8
-; CHECK-NEXT:    ldp q3, q1, [x1]
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    sunpklo z7.s, z0.h
-; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    sunpklo z16.s, z0.h
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
 ; CHECK-NEXT:    sunpklo z5.s, z5.h
-; CHECK-NEXT:    mov z16.d, z3.d
-; CHECK-NEXT:    ext z16.b, z16.b, z3.b, #8
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    sunpklo z6.s, z1.h
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
-; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    sunpklo z4.s, z4.h
-; CHECK-NEXT:    sunpklo z7.s, z16.h
-; CHECK-NEXT:    sunpklo z16.s, z17.h
-; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    movprfx z5, z16
-; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z7.s
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    ldr q3, [x0]
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    sunpklo z7.s, z3.h
-; CHECK-NEXT:    sunpklo z16.s, z2.h
+; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z6.s, z6.h
+; CHECK-NEXT:    sdivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    sunpklo z7.s, z1.h
 ; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
 ; CHECK-NEXT:    sdivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    splice z6.h, p0, z6.h, z5.h
 ; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    splice z7.h, p0, z7.h, z5.h
-; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
-; CHECK-NEXT:    splice z5.h, p0, z5.h, z4.h
-; CHECK-NEXT:    mls z2.h, p1/m, z7.h, z3.h
-; CHECK-NEXT:    mls z0.h, p1/m, z5.h, z1.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z2.h
+; CHECK-NEXT:    movprfx z2, z3
+; CHECK-NEXT:    mls z2.h, p1/m, z6.h, z4.h
+; CHECK-NEXT:    mls z0.h, p1/m, z7.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -287,8 +290,8 @@ define void @srem_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: srem_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
@@ -302,8 +305,8 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: srem_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.s, p0/m, z2.s, z1.s
@@ -317,14 +320,15 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @srem_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: srem_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    sdiv z4.s, p0/m, z4.s, z2.s
-; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    movprfx z4, z1
+; CHECK-NEXT:    sdiv z4.s, p0/m, z4.s, z0.s
+; CHECK-NEXT:    movprfx z5, z2
 ; CHECK-NEXT:    sdiv z5.s, p0/m, z5.s, z3.s
-; CHECK-NEXT:    mls z0.s, p0/m, z4.s, z2.s
+; CHECK-NEXT:    msb z0.s, p0/m, z4.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    mls z1.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -338,8 +342,8 @@ define void @srem_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: srem_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
@@ -353,8 +357,8 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: srem_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    sdiv z2.d, p0/m, z2.d, z1.d
@@ -368,14 +372,15 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @srem_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: srem_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    sdiv z4.d, p0/m, z4.d, z2.d
-; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    movprfx z4, z1
+; CHECK-NEXT:    sdiv z4.d, p0/m, z4.d, z0.d
+; CHECK-NEXT:    movprfx z5, z2
 ; CHECK-NEXT:    sdiv z5.d, p0/m, z5.d, z3.d
-; CHECK-NEXT:    mls z0.d, p0/m, z4.d, z2.d
+; CHECK-NEXT:    msb z0.d, p0/m, z4.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    mls z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -417,21 +422,21 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z2.h, z1.b
 ; CHECK-NEXT:    uunpklo z3.h, z0.b
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    splice z4.h, p0, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
 ; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -446,43 +451,42 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ptrue p1.b, vl16
 ; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z2.h, z2.b
 ; CHECK-NEXT:    uunpklo z3.h, z3.b
-; CHECK-NEXT:    uunpklo z5.s, z2.h
-; CHECK-NEXT:    uunpklo z6.s, z3.h
+; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    uunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    uunpklo z5.h, z0.b
+; CHECK-NEXT:    uunpklo z7.s, z5.h
+; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
+; CHECK-NEXT:    uunpklo z5.s, z5.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    uunpklo z4.h, z1.b
-; CHECK-NEXT:    uunpklo z6.h, z0.b
-; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    uunpklo z3.s, z4.h
-; CHECK-NEXT:    splice z5.h, p1, z5.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z6.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpklo z6.s, z6.h
-; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    movprfx z3, z6
-; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uunpklo z3.h, z1.b
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uunpklo z6.s, z3.h
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    uzp1 z4.b, z5.b, z5.b
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z3.h
+; CHECK-NEXT:    splice z5.h, p0, z5.h, z3.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    splice z2.b, p0, z2.b, z4.b
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    mls z0.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
+; CHECK-NEXT:    splice z3.b, p0, z3.b, z2.b
+; CHECK-NEXT:    mls z0.b, p1/m, z3.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = urem <16 x i8> %op1, %op2
@@ -492,80 +496,83 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @urem_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: urem_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q0, [x0]
+; CHECK-NEXT:    ldr q0, [x0, #16]
+; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    ldp q3, q1, [x1]
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    uunpklo z7.h, z0.b
-; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z5.h, z5.b
-; CHECK-NEXT:    uunpklo z18.s, z5.h
-; CHECK-NEXT:    ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT:    uunpklo z5.s, z5.h
-; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    ptrue p1.b, vl16
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
 ; CHECK-NEXT:    uunpklo z6.h, z1.b
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
-; CHECK-NEXT:    uunpklo z16.s, z6.h
-; CHECK-NEXT:    uunpklo z4.h, z4.b
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    uunpklo z17.s, z4.h
-; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
-; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uunpklo z18.s, z7.h
-; CHECK-NEXT:    uzp1 z17.h, z17.h, z17.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    uunpklo z5.s, z6.h
-; CHECK-NEXT:    splice z17.h, p1, z17.h, z4.h
-; CHECK-NEXT:    uunpklo z4.s, z7.h
-; CHECK-NEXT:    mov z6.d, z3.d
-; CHECK-NEXT:    mov z7.d, z2.d
-; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
-; CHECK-NEXT:    ext z7.b, z7.b, z2.b, #8
-; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z18.s
-; CHECK-NEXT:    uunpklo z6.h, z6.b
-; CHECK-NEXT:    uunpklo z7.h, z7.b
-; CHECK-NEXT:    udiv z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    uzp1 z5.h, z16.h, z16.h
+; CHECK-NEXT:    uunpklo z7.h, z0.b
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z16.s, z6.h
-; CHECK-NEXT:    uunpklo z18.s, z7.h
 ; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    uunpklo z17.s, z7.h
 ; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
+; CHECK-NEXT:    uunpklo z4.h, z2.b
+; CHECK-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEXT:    uunpklo z6.s, z6.h
 ; CHECK-NEXT:    uunpklo z7.s, z7.h
-; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z18.s
-; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z7.h, z16.h, z16.h
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    splice z5.h, p1, z5.h, z4.h
-; CHECK-NEXT:    splice z7.h, p1, z7.h, z6.h
-; CHECK-NEXT:    uunpklo z4.h, z3.b
-; CHECK-NEXT:    uunpklo z6.h, z2.b
-; CHECK-NEXT:    uunpklo z16.s, z4.h
-; CHECK-NEXT:    uunpklo z18.s, z6.h
+; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z17.s
+; CHECK-NEXT:    uunpklo z2.s, z4.h
+; CHECK-NEXT:    uunpklo z5.s, z3.h
 ; CHECK-NEXT:    ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
 ; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpklo z6.s, z6.h
-; CHECK-NEXT:    udivr z16.s, p0/m, z16.s, z18.s
-; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z6.s
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z5.s
+; CHECK-NEXT:    ldr q5, [x1]
+; CHECK-NEXT:    mov z17.d, z5.d
 ; CHECK-NEXT:    uzp1 z16.h, z16.h, z16.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    splice z16.h, p1, z16.h, z4.h
-; CHECK-NEXT:    uzp1 z6.b, z17.b, z17.b
-; CHECK-NEXT:    uzp1 z5.b, z5.b, z5.b
-; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z4.b, z7.b, z7.b
+; CHECK-NEXT:    ext z17.b, z17.b, z5.b, #8
+; CHECK-NEXT:    uunpklo z17.h, z17.b
+; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    ldr q4, [x0]
+; CHECK-NEXT:    uunpklo z19.s, z17.h
+; CHECK-NEXT:    ext z17.b, z17.b, z17.b, #8
+; CHECK-NEXT:    mov z18.d, z4.d
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uunpklo z17.s, z17.h
+; CHECK-NEXT:    ext z18.b, z18.b, z4.b, #8
+; CHECK-NEXT:    uunpklo z18.h, z18.b
+; CHECK-NEXT:    uunpklo z20.s, z18.h
+; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    uunpklo z18.s, z18.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    udivr z19.s, p0/m, z19.s, z20.s
+; CHECK-NEXT:    uunpklo z20.h, z4.b
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    uunpklo z22.s, z20.h
+; CHECK-NEXT:    ext z20.b, z20.b, z20.b, #8
+; CHECK-NEXT:    uunpklo z20.s, z20.h
+; CHECK-NEXT:    udivr z17.s, p0/m, z17.s, z18.s
+; CHECK-NEXT:    uunpklo z18.h, z5.b
+; CHECK-NEXT:    uzp1 z7.h, z19.h, z19.h
+; CHECK-NEXT:    uunpklo z21.s, z18.h
+; CHECK-NEXT:    ext z18.b, z18.b, z18.b, #8
+; CHECK-NEXT:    uunpklo z18.s, z18.h
+; CHECK-NEXT:    udivr z21.s, p0/m, z21.s, z22.s
+; CHECK-NEXT:    uzp1 z17.h, z17.h, z17.h
+; CHECK-NEXT:    udivr z18.s, p0/m, z18.s, z20.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z19.h, z21.h, z21.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z17.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    splice z16.h, p0, z16.h, z6.h
+; CHECK-NEXT:    uzp1 z3.b, z7.b, z7.b
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
 ; CHECK-NEXT:    uzp1 z7.b, z16.b, z16.b
-; CHECK-NEXT:    ptrue p1.b, vl16
-; CHECK-NEXT:    splice z7.b, p0, z7.b, z4.b
-; CHECK-NEXT:    splice z5.b, p0, z5.b, z6.b
-; CHECK-NEXT:    mls z2.b, p1/m, z7.b, z3.b
-; CHECK-NEXT:    mls z0.b, p1/m, z5.b, z1.b
+; CHECK-NEXT:    uzp1 z18.h, z18.h, z18.h
+; CHECK-NEXT:    splice z19.h, p0, z19.h, z18.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z6.b, z19.b, z19.b
+; CHECK-NEXT:    splice z7.b, p0, z7.b, z2.b
+; CHECK-NEXT:    splice z6.b, p0, z6.b, z3.b
+; CHECK-NEXT:    movprfx z2, z4
+; CHECK-NEXT:    mls z2.b, p1/m, z6.b, z5.b
+; CHECK-NEXT:    mls z0.b, p1/m, z7.b, z1.b
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -578,9 +585,9 @@ define void @urem_v32i8(ptr %a, ptr %b) {
 define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: urem_v4i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEXT:    uunpklo z3.s, z0.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
@@ -600,21 +607,21 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    uunpklo z4.s, z0.h
+; CHECK-NEXT:    ptrue p1.h, vl8
 ; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
 ; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z4.s, z1.h
 ; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT:    uunpklo z3.s, z0.h
-; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uunpklo z3.s, z1.h
+; CHECK-NEXT:    udivr z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
-; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    mls z0.h, p0/m, z3.h, z1.h
+; CHECK-NEXT:    mls z0.h, p1/m, z3.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %res = urem <8 x i16> %op1, %op2
@@ -624,40 +631,41 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @urem_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: urem_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q0, [x0]
+; CHECK-NEXT:    ldp q4, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ptrue p1.h, vl8
-; CHECK-NEXT:    mov z17.d, z2.d
-; CHECK-NEXT:    ext z17.b, z17.b, z2.b, #8
-; CHECK-NEXT:    ldp q3, q1, [x1]
-; CHECK-NEXT:    mov z5.d, z0.d
-; CHECK-NEXT:    uunpklo z7.s, z0.h
-; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    mov z3.d, z0.d
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    uunpklo z16.s, z0.h
+; CHECK-NEXT:    ext z2.b, z2.b, z1.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z0.b, #8
+; CHECK-NEXT:    ext z5.b, z5.b, z4.b, #8
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
 ; CHECK-NEXT:    uunpklo z5.s, z5.h
-; CHECK-NEXT:    mov z16.d, z3.d
-; CHECK-NEXT:    ext z16.b, z16.b, z3.b, #8
-; CHECK-NEXT:    mov z4.d, z1.d
-; CHECK-NEXT:    uunpklo z6.s, z1.h
-; CHECK-NEXT:    ext z4.b, z4.b, z1.b, #8
-; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT:    uunpklo z4.s, z4.h
-; CHECK-NEXT:    uunpklo z7.s, z16.h
-; CHECK-NEXT:    uunpklo z16.s, z17.h
-; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT:    movprfx z5, z16
-; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z7.s
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    ldr q3, [x0]
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    uunpklo z7.s, z3.h
-; CHECK-NEXT:    uunpklo z16.s, z2.h
+; CHECK-NEXT:    ext z6.b, z6.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z6.s, z6.h
+; CHECK-NEXT:    udivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    uunpklo z7.s, z1.h
 ; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
 ; CHECK-NEXT:    udivr z7.s, p0/m, z7.s, z16.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    splice z6.h, p0, z6.h, z5.h
 ; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    splice z7.h, p0, z7.h, z5.h
-; CHECK-NEXT:    uzp1 z5.h, z6.h, z6.h
-; CHECK-NEXT:    splice z5.h, p0, z5.h, z4.h
-; CHECK-NEXT:    mls z2.h, p1/m, z7.h, z3.h
-; CHECK-NEXT:    mls z0.h, p1/m, z5.h, z1.h
+; CHECK-NEXT:    splice z7.h, p0, z7.h, z2.h
+; CHECK-NEXT:    movprfx z2, z3
+; CHECK-NEXT:    mls z2.h, p1/m, z6.h, z4.h
+; CHECK-NEXT:    mls z0.h, p1/m, z7.h, z1.h
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -670,8 +678,8 @@ define void @urem_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: urem_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
@@ -685,8 +693,8 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: urem_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.s, p0/m, z2.s, z1.s
@@ -700,14 +708,15 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @urem_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: urem_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    udiv z4.s, p0/m, z4.s, z2.s
-; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    movprfx z4, z1
+; CHECK-NEXT:    udiv z4.s, p0/m, z4.s, z0.s
+; CHECK-NEXT:    movprfx z5, z2
 ; CHECK-NEXT:    udiv z5.s, p0/m, z5.s, z3.s
-; CHECK-NEXT:    mls z0.s, p0/m, z4.s, z2.s
+; CHECK-NEXT:    msb z0.s, p0/m, z4.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    mls z1.s, p0/m, z5.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -721,8 +730,8 @@ define void @urem_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: urem_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
@@ -736,8 +745,8 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: urem_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    movprfx z2, z0
 ; CHECK-NEXT:    udiv z2.d, p0/m, z2.d, z1.d
@@ -751,14 +760,15 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @urem_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: urem_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    movprfx z4, z0
-; CHECK-NEXT:    udiv z4.d, p0/m, z4.d, z2.d
-; CHECK-NEXT:    movprfx z5, z1
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    movprfx z4, z1
+; CHECK-NEXT:    udiv z4.d, p0/m, z4.d, z0.d
+; CHECK-NEXT:    movprfx z5, z2
 ; CHECK-NEXT:    udiv z5.d, p0/m, z5.d, z3.d
-; CHECK-NEXT:    mls z0.d, p0/m, z4.d, z2.d
+; CHECK-NEXT:    msb z0.d, p0/m, z4.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    mls z1.d, p0/m, z5.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
index 5654fe938ddb64..5bcdaafc760dfb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
@@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.h, w8
@@ -22,8 +22,8 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) {
 define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.b, w8
@@ -38,8 +38,8 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) {
 define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mov z2.b, w8
@@ -54,14 +54,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) {
 define void @select_v32i8(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-LABEL: select_v32i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    mov z0.b, w8
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x0, #16]
-; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q3, [x1, #16]
-; CHECK-NEXT:    mov z4.b, w8
-; CHECK-NEXT:    cmpne p0.b, p0/z, z4.b, #0
 ; CHECK-NEXT:    sel z0.b, p0, z0.b, z2.b
 ; CHECK-NEXT:    sel z1.b, p0, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -76,8 +76,8 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) {
 define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.s, w8
@@ -92,8 +92,8 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) {
 define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.h, w8
@@ -108,8 +108,8 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) {
 define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mov z2.h, w8
@@ -124,14 +124,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) {
 define void @select_v16i16(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-LABEL: select_v16i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    mov z0.h, w8
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x0, #16]
-; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q3, [x1, #16]
-; CHECK-NEXT:    mov z4.h, w8
-; CHECK-NEXT:    cmpne p0.h, p0/z, z4.h, #0
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT:    sel z1.h, p0, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -146,8 +146,8 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) {
 define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.s, w8
@@ -162,8 +162,8 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) {
 define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mov z2.s, w8
@@ -178,14 +178,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) {
 define void @select_v8i32(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-LABEL: select_v8i32:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    and w8, w2, #0x1
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x0, #16]
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q3, [x1, #16]
-; CHECK-NEXT:    mov z4.s, w8
-; CHECK-NEXT:    cmpne p0.s, p0/z, z4.s, #0
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z2.s
 ; CHECK-NEXT:    sel z1.s, p0, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -200,9 +200,9 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) {
 define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v1i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0x1
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.d, x8
@@ -217,9 +217,9 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) {
 define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) {
 ; CHECK-LABEL: select_v2i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0x1
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    mov z2.d, x8
@@ -234,15 +234,15 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) {
 define void @select_v4i64(ptr %a, ptr %b, i1 %mask) {
 ; CHECK-LABEL: select_v4i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
 ; CHECK-NEXT:    and x8, x2, #0x1
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x0, #16]
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ldr q2, [x1]
 ; CHECK-NEXT:    ldr q3, [x1, #16]
-; CHECK-NEXT:    mov z4.d, x8
-; CHECK-NEXT:    cmpne p0.d, p0/z, z4.d, #0
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z2.d
 ; CHECK-NEXT:    sel z1.d, p0, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index e6e0a07cbf33b7..b9f10554c2f69a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -10,11 +10,11 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-LABEL: ashr_v4i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    sxtb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-NEXT:    sxtb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -25,8 +25,8 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-LABEL: ashr_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -38,8 +38,8 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-LABEL: ashr_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -51,10 +51,11 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @ashr_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: ashr_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    asrr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    asr z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -68,11 +69,11 @@ define void @ashr_v32i8(ptr %a, ptr %b) {
 define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-LABEL: ashr_v2i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -83,8 +84,8 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: ashr_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -96,8 +97,8 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-LABEL: ashr_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -109,10 +110,11 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @ashr_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: ashr_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    asrr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    asr z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -126,8 +128,8 @@ define void @ashr_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: ashr_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -139,8 +141,8 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: ashr_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -152,10 +154,11 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @ashr_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: ashr_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    asrr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    asr z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -169,8 +172,8 @@ define void @ashr_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: ashr_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -182,8 +185,8 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: ashr_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -195,10 +198,11 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @ashr_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: ashr_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    asrr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    asr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -216,9 +220,9 @@ define void @ashr_v4i64(ptr %a, ptr %b) {
 define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-LABEL: lshr_v4i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
 ; CHECK-NEXT:    and z0.h, z0.h, #0xff
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
@@ -231,8 +235,8 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-LABEL: lshr_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -244,8 +248,8 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-LABEL: lshr_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -257,10 +261,11 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @lshr_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: lshr_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    lsrr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    lsr z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -274,9 +279,9 @@ define void @lshr_v32i8(ptr %a, ptr %b) {
 define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-LABEL: lshr_v2i16:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    and z1.s, z1.s, #0xffff
 ; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
@@ -289,8 +294,8 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: lshr_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -302,8 +307,8 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-LABEL: lshr_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -315,10 +320,11 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @lshr_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: lshr_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    lsrr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    lsr z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -332,8 +338,8 @@ define void @lshr_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: lshr_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -345,8 +351,8 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: lshr_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -358,10 +364,11 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @lshr_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: lshr_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    lsrr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    lsr z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -375,8 +382,8 @@ define void @lshr_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: lshr_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -388,8 +395,8 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: lshr_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -401,10 +408,11 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @lshr_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: lshr_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    lsrr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -422,9 +430,9 @@ define void @lshr_v4i64(ptr %a, ptr %b) {
 define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) {
 ; CHECK-LABEL: shl_v2i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.s, vl2
 ; CHECK-NEXT:    and z1.s, z1.s, #0xff
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -436,9 +444,9 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) {
 define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-LABEL: shl_v4i8:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -450,8 +458,8 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 ; CHECK-LABEL: shl_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -463,8 +471,8 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) {
 define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 ; CHECK-LABEL: shl_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -476,10 +484,11 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @shl_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: shl_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z2.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    lslr z0.b, p0/m, z0.b, z1.b
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    lsl z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -493,8 +502,8 @@ define void @shl_v32i8(ptr %a, ptr %b) {
 define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 ; CHECK-LABEL: shl_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -506,8 +515,8 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) {
 define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 ; CHECK-LABEL: shl_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -519,10 +528,11 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @shl_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: shl_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    lslr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    lsl z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -536,8 +546,8 @@ define void @shl_v16i16(ptr %a, ptr %b) {
 define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 ; CHECK-LABEL: shl_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -549,8 +559,8 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) {
 define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 ; CHECK-LABEL: shl_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -562,10 +572,11 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @shl_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: shl_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    lslr z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    lsl z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -579,8 +590,8 @@ define void @shl_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 ; CHECK-LABEL: shl_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -592,8 +603,8 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) {
 define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 ; CHECK-LABEL: shl_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
 ; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -605,10 +616,11 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @shl_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: shl_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    lslr z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index 4ae4e6538703ca..c110e89326cc0c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -10,8 +10,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) {
 ; CHECK-LABEL: ucvtf_v4i16_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -22,8 +22,8 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) {
 define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: ucvtf_v8i16_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
@@ -36,8 +36,8 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) {
 define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: ucvtf_v16i16_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    ucvtf z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -55,8 +55,8 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) {
 define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) {
 ; CHECK-LABEL: ucvtf_v2i16_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -68,8 +68,8 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) {
 define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; CHECK-LABEL: ucvtf_v4i16_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -99,21 +99,20 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) {
 define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: ucvtf_v16i16_v16f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    ucvtf z2.s, p0/m, z2.s
 ; CHECK-NEXT:    ucvtf z3.s, p0/m, z3.s
-; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    ucvtf z1.s, p0/m, z2.s
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x float>
@@ -179,18 +178,17 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z2.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    uunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q3, q0, [x1, #32]
-; CHECK-NEXT:    movprfx z0, z1
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z1.d
-; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = uitofp <8 x i16> %op1 to <8 x double>
@@ -201,42 +199,44 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) {
 define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: ucvtf_v16i16_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    uunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.s, z0.h
-; CHECK-NEXT:    uunpklo z4.d, z2.s
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z6.d, z0.s
-; CHECK-NEXT:    uunpklo z7.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z1.d, z1.s
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    mov z7.d, z3.d
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z4.b, z4.b, z2.b, #8
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    uunpklo z4.d, z4.s
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
-; CHECK-NEXT:    ucvtf z7.d, p0/m, z7.d
+; CHECK-NEXT:    ext z6.b, z6.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    uunpklo z7.d, z7.s
+; CHECK-NEXT:    uunpklo z5.d, z5.s
+; CHECK-NEXT:    ucvtf z4.d, p0/m, z4.d
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    uunpklo z6.d, z6.s
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    uunpklo z3.d, z3.s
-; CHECK-NEXT:    stp q7, q1, [x1, #96]
 ; CHECK-NEXT:    ucvtf z5.d, p0/m, z5.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z3.d
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    stp q5, q1, [x1, #64]
-; CHECK-NEXT:    movprfx z3, z6
-; CHECK-NEXT:    ucvtf z3.d, p0/m, z6.d
-; CHECK-NEXT:    stp q3, q0, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z4
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z4.d
-; CHECK-NEXT:    movprfx z0, z2
-; CHECK-NEXT:    ucvtf z0.d, p0/m, z2.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    stp q2, q4, [x1, #64]
+; CHECK-NEXT:    movprfx z2, z6
+; CHECK-NEXT:    ucvtf z2.d, p0/m, z6.d
+; CHECK-NEXT:    stp q1, q2, [x1, #32]
+; CHECK-NEXT:    stp q0, q5, [x1, #96]
+; CHECK-NEXT:    movprfx z0, z7
+; CHECK-NEXT:    ucvtf z0.d, p0/m, z7.d
+; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = uitofp <16 x i16> %op1 to <16 x double>
@@ -251,8 +251,8 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) {
 define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) {
 ; CHECK-LABEL: ucvtf_v2i32_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -264,8 +264,8 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) {
 define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) {
 ; CHECK-LABEL: ucvtf_v4i32_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -277,14 +277,14 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) {
 define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) {
 ; CHECK-LABEL: ucvtf_v8i32_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ucvtf z1.h, p0/m, z1.s
 ; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z0.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -295,21 +295,21 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) {
 define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: ucvtf_v16i32_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ldp q3, q2, [x0, #32]
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    ucvtf z1.h, p0/m, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    ucvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT:    ucvtf z3.h, p0/m, z3.s
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    ucvtf z2.h, p0/m, z2.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
-; CHECK-NEXT:    stp q0, q3, [x1]
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = uitofp <16 x i32> %op1 to <16 x half>
@@ -324,8 +324,8 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) {
 define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) {
 ; CHECK-LABEL: ucvtf_v2i32_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -336,8 +336,8 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) {
 define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) {
 ; CHECK-LABEL: ucvtf_v4i32_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -348,8 +348,8 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) {
 define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: ucvtf_v8i32_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -367,8 +367,8 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) {
 define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; CHECK-LABEL: ucvtf_v2i32_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -398,21 +398,20 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) {
 define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: ucvtf_v8i32_v8f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    uunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z3.d, z1.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    uunpklo z0.d, z0.s
 ; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    ucvtf z2.d, p0/m, z2.d
 ; CHECK-NEXT:    ucvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    ucvtf z1.d, p0/m, z2.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = uitofp <8 x i32> %op1 to <8 x double>
@@ -432,9 +431,9 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z1.d, z0.d[1]
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov x9, d1
 ; CHECK-NEXT:    ucvtf h0, x8
-; CHECK-NEXT:    ucvtf h1, x9
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    ucvtf h1, x8
 ; CHECK-NEXT:    str h0, [sp, #8]
 ; CHECK-NEXT:    str h1, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
@@ -447,17 +446,16 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) {
 define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) {
 ; CHECK-LABEL: ucvtf_v4i64_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    movprfx z0, z1
-; CHECK-NEXT:    fcvt z0.h, p0/m, z1.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcvt z0.h, p1/m, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -469,25 +467,26 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) {
 define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) {
 ; CHECK-LABEL: ucvtf_v8i64_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ptrue p1.s, vl2
-; CHECK-NEXT:    ptrue p2.s
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p1, z0.s, z1.s
 ; CHECK-NEXT:    ucvtf z3.s, p0/m, z3.d
-; CHECK-NEXT:    fcvt z0.h, p2/m, z0.s
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    ucvtf z2.s, p0/m, z2.d
-; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    splice z3.s, p1, z3.s, z2.s
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    fcvt z1.h, p2/m, z3.s
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvt z0.h, p1/m, z1.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvt z1.h, p1/m, z2.s
 ; CHECK-NEXT:    uzp1 z2.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z0.h, z1.h, z1.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
@@ -505,8 +504,8 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) {
 define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) {
 ; CHECK-LABEL: ucvtf_v2i64_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -518,14 +517,14 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) {
 define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) {
 ; CHECK-LABEL: ucvtf_v4i64_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.d
 ; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
@@ -536,21 +535,21 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) {
 define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: ucvtf_v8i64_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ptrue p1.s, vl2
-; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q3, q2, [x0, #32]
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
 ; CHECK-NEXT:    ucvtf z3.s, p0/m, z3.d
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    ucvtf z2.s, p0/m, z2.d
+; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    splice z3.s, p1, z3.s, z2.s
-; CHECK-NEXT:    stp q0, q3, [x1]
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i64>, ptr %a
   %res = uitofp <8 x i64> %op1 to <8 x float>
@@ -565,8 +564,8 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) {
 define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) {
 ; CHECK-LABEL: ucvtf_v2i64_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -577,8 +576,8 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) {
 define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: ucvtf_v4i64_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -596,8 +595,8 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) {
 define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) {
 ; CHECK-LABEL: scvtf_v4i16_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -608,8 +607,8 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) {
 define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v8i16_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
@@ -622,8 +621,8 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) {
 define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v16i16_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT:    scvtf z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -640,8 +639,8 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) {
 define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) {
 ; CHECK-LABEL: scvtf_v2i16_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -653,8 +652,8 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) {
 define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; CHECK-LABEL: scvtf_v4i16_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -684,21 +683,20 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) {
 define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v16i16_v16f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    sunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    scvtf z2.s, p0/m, z2.s
 ; CHECK-NEXT:    scvtf z3.s, p0/m, z3.s
-; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    scvtf z1.s, p0/m, z2.s
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x float>
@@ -713,8 +711,8 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) {
 define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) {
 ; CHECK-LABEL: scvtf_v2i16_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
@@ -753,18 +751,17 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z2.d, z1.s
+; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    sunpklo z3.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
 ; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    stp q2, q1, [x1]
 ; CHECK-NEXT:    stp q3, q0, [x1, #32]
-; CHECK-NEXT:    movprfx z0, z1
-; CHECK-NEXT:    scvtf z0.d, p0/m, z1.d
-; CHECK-NEXT:    stp q2, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i16>, ptr %a
   %res = sitofp <8 x i16> %op1 to <8 x double>
@@ -775,42 +772,44 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) {
 define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v16i16_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    sunpklo z2.s, z0.h
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z0.s, z0.h
-; CHECK-NEXT:    sunpklo z4.d, z2.s
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z0.s, z0.h
 ; CHECK-NEXT:    sunpklo z1.s, z1.h
-; CHECK-NEXT:    sunpklo z6.d, z0.s
-; CHECK-NEXT:    sunpklo z7.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    ext z3.b, z3.b, z3.b, #8
+; CHECK-NEXT:    mov z4.d, z2.d
+; CHECK-NEXT:    mov z7.d, z3.d
+; CHECK-NEXT:    mov z5.d, z0.d
+; CHECK-NEXT:    ext z4.b, z4.b, z2.b, #8
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    mov z6.d, z1.d
+; CHECK-NEXT:    ext z7.b, z7.b, z3.b, #8
+; CHECK-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEXT:    ext z5.b, z5.b, z0.b, #8
+; CHECK-NEXT:    sunpklo z4.d, z4.s
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    scvtf z7.d, p0/m, z7.d
+; CHECK-NEXT:    ext z6.b, z6.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    sunpklo z7.d, z7.s
+; CHECK-NEXT:    sunpklo z5.d, z5.s
+; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
+; CHECK-NEXT:    sunpklo z6.d, z6.s
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT:    sunpklo z3.d, z3.s
-; CHECK-NEXT:    stp q7, q1, [x1, #96]
 ; CHECK-NEXT:    scvtf z5.d, p0/m, z5.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    scvtf z1.d, p0/m, z3.d
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    stp q5, q1, [x1, #64]
-; CHECK-NEXT:    movprfx z3, z6
-; CHECK-NEXT:    scvtf z3.d, p0/m, z6.d
-; CHECK-NEXT:    stp q3, q0, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z4
-; CHECK-NEXT:    scvtf z1.d, p0/m, z4.d
-; CHECK-NEXT:    movprfx z0, z2
-; CHECK-NEXT:    scvtf z0.d, p0/m, z2.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    stp q2, q4, [x1, #64]
+; CHECK-NEXT:    movprfx z2, z6
+; CHECK-NEXT:    scvtf z2.d, p0/m, z6.d
+; CHECK-NEXT:    stp q1, q2, [x1, #32]
+; CHECK-NEXT:    stp q0, q5, [x1, #96]
+; CHECK-NEXT:    movprfx z0, z7
+; CHECK-NEXT:    scvtf z0.d, p0/m, z7.d
+; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %res = sitofp <16 x i16> %op1 to <16 x double>
@@ -825,8 +824,8 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) {
 define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) {
 ; CHECK-LABEL: scvtf_v2i32_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -838,8 +837,8 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) {
 define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) {
 ; CHECK-LABEL: scvtf_v4i32_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -851,14 +850,14 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) {
 define <8 x half> @scvtf_v8i32_v8f16(ptr %a) {
 ; CHECK-LABEL: scvtf_v8i32_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    scvtf z1.h, p0/m, z1.s
 ; CHECK-NEXT:    scvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z0.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z2.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
@@ -873,8 +872,8 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) {
 define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) {
 ; CHECK-LABEL: scvtf_v2i32_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -885,8 +884,8 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) {
 define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) {
 ; CHECK-LABEL: scvtf_v4i32_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -897,8 +896,8 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) {
 define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v8i32_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x1]
@@ -916,8 +915,8 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) {
 define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; CHECK-LABEL: scvtf_v2i32_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
@@ -947,21 +946,20 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) {
 define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v8i32_v8f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
 ; CHECK-NEXT:    sunpklo z2.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z3.d, z1.s
 ; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
 ; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
 ; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q3, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z1, z2
-; CHECK-NEXT:    scvtf z1.d, p0/m, z2.d
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    stp q2, q0, [x1, #32]
+; CHECK-NEXT:    stp q3, q1, [x1]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %res = sitofp <8 x i32> %op1 to <8 x double>
@@ -972,38 +970,40 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) {
 define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v16i32_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    mov z6.d, z2.d
+; CHECK-NEXT:    ldp q5, q4, [x0]
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    mov z3.d, z1.d
+; CHECK-NEXT:    mov z6.d, z4.d
+; CHECK-NEXT:    mov z7.d, z5.d
+; CHECK-NEXT:    ext z2.b, z2.b, z0.b, #8
+; CHECK-NEXT:    ext z3.b, z3.b, z1.b, #8
+; CHECK-NEXT:    sunpklo z0.d, z0.s
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    ext z6.b, z6.b, z4.b, #8
+; CHECK-NEXT:    sunpklo z4.d, z4.s
+; CHECK-NEXT:    ext z7.b, z7.b, z5.b, #8
+; CHECK-NEXT:    sunpklo z5.d, z5.s
 ; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    ext z6.b, z6.b, z6.b, #8
-; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov z7.d, z3.d
 ; CHECK-NEXT:    sunpklo z3.d, z3.s
-; CHECK-NEXT:    ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
-; CHECK-NEXT:    sunpklo z7.d, z7.s
+; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    sunpklo z6.d, z6.s
-; CHECK-NEXT:    sunpklo z4.d, z0.s
-; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    sunpklo z0.d, z0.s
-; CHECK-NEXT:    scvtf z7.d, p0/m, z7.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT:    sunpklo z1.d, z1.s
-; CHECK-NEXT:    stp q3, q7, [x1, #96]
-; CHECK-NEXT:    movprfx z3, z6
-; CHECK-NEXT:    scvtf z3.d, p0/m, z6.d
-; CHECK-NEXT:    stp q2, q3, [x1, #64]
+; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
+; CHECK-NEXT:    scvtf z4.d, p0/m, z4.d
+; CHECK-NEXT:    sunpklo z7.d, z7.s
+; CHECK-NEXT:    scvtf z2.d, p0/m, z2.d
+; CHECK-NEXT:    scvtf z3.d, p0/m, z3.d
+; CHECK-NEXT:    stp q1, q3, [x1, #64]
+; CHECK-NEXT:    movprfx z1, z7
+; CHECK-NEXT:    scvtf z1.d, p0/m, z7.d
+; CHECK-NEXT:    stp q0, q2, [x1, #96]
+; CHECK-NEXT:    movprfx z0, z6
+; CHECK-NEXT:    scvtf z0.d, p0/m, z6.d
 ; CHECK-NEXT:    movprfx z2, z5
 ; CHECK-NEXT:    scvtf z2.d, p0/m, z5.d
-; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
-; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
-; CHECK-NEXT:    stp q2, q1, [x1, #32]
-; CHECK-NEXT:    movprfx z2, z4
-; CHECK-NEXT:    scvtf z2.d, p0/m, z4.d
-; CHECK-NEXT:    stp q2, q0, [x1]
+; CHECK-NEXT:    stp q2, q1, [x1]
+; CHECK-NEXT:    stp q4, q0, [x1, #32]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %res = sitofp <16 x i32> %op1 to <16 x double>
@@ -1023,9 +1023,9 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z1.d, z0.d[1]
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov x9, d1
 ; CHECK-NEXT:    scvtf h0, x8
-; CHECK-NEXT:    scvtf h1, x9
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    scvtf h1, x8
 ; CHECK-NEXT:    str h0, [sp, #8]
 ; CHECK-NEXT:    str h1, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
@@ -1038,17 +1038,16 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 define <4 x half> @scvtf_v4i64_v4f16(ptr %a) {
 ; CHECK-LABEL: scvtf_v4i64_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ptrue p1.s
 ; CHECK-NEXT:    scvtf z1.s, p0/m, z1.d
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    movprfx z0, z1
-; CHECK-NEXT:    fcvt z0.h, p0/m, z1.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    fcvt z0.h, p1/m, z0.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1064,8 +1063,8 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) {
 define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) {
 ; CHECK-LABEL: scvtf_v2i64_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -1077,14 +1076,14 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) {
 define <4 x float> @scvtf_v4i64_v4f32(ptr %a) {
 ; CHECK-LABEL: scvtf_v4i64_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    scvtf z1.s, p0/m, z1.d
 ; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z0.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z2.s
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
@@ -1099,8 +1098,8 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) {
 define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) {
 ; CHECK-LABEL: scvtf_v2i64_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -1111,8 +1110,8 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) {
 define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v4i64_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT:    scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x1]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 01618021c9391c..5108ad9d2b5477 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -60,14 +60,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
 define void @select_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpeq p1.b, p0/z, z1.b, z2.b
-; CHECK-NEXT:    sel z1.b, p1, z1.b, z2.b
-; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, z3.b
-; CHECK-NEXT:    sel z0.b, p0, z0.b, z3.b
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
+; CHECK-NEXT:    cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT:    sel z0.b, p1, z0.b, z1.b
+; CHECK-NEXT:    sel z1.b, p0, z2.b, z3.b
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -135,14 +135,14 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
 define void @select_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpeq p1.h, p0/z, z1.h, z2.h
-; CHECK-NEXT:    sel z1.h, p1, z1.h, z2.h
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, z3.h
-; CHECK-NEXT:    sel z0.h, p0, z0.h, z3.h
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
+; CHECK-NEXT:    cmpeq p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
+; CHECK-NEXT:    sel z1.h, p0, z2.h, z3.h
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -192,14 +192,14 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) {
 define void @select_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpeq p1.s, p0/z, z1.s, z2.s
-; CHECK-NEXT:    sel z1.s, p1, z1.s, z2.s
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z3.s
-; CHECK-NEXT:    sel z0.s, p0, z0.s, z3.s
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmpeq p0.s, p0/z, z2.s, z3.s
+; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
+; CHECK-NEXT:    sel z1.s, p0, z2.s, z3.s
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -212,9 +212,9 @@ define void @select_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) {
 ; CHECK-LABEL: select_v1i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0x1
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT:    mov z2.d, x8
@@ -248,14 +248,14 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) {
 define void @select_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    cmpeq p1.d, p0/z, z1.d, z2.d
-; CHECK-NEXT:    sel z1.d, p1, z1.d, z2.d
-; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, z3.d
-; CHECK-NEXT:    sel z0.d, p0, z0.d, z3.d
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z3.d
+; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
+; CHECK-NEXT:    sel z1.d, p0, z2.d, z3.d
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 91b2b59534bb8a..15e875d3f50d21 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -49,22 +49,22 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    ptrue p0.b, vl3
 ; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x20]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    mov z2.b, z1.b[3]
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    mov z3.b, z1.b[2]
-; CHECK-NEXT:    mov z0.b, z1.b[1]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov z4.b, z1.b[1]
 ; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w9, [sp, #6]
-; CHECK-NEXT:    strh w10, [sp, #4]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    strh w8, [sp, #6]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strh w9, [sp, #4]
 ; CHECK-NEXT:    strh w8, [sp, #2]
 ; CHECK-NEXT:    add x8, sp, #12
 ; CHECK-NEXT:    ldr d0, [sp]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x8]
 ; CHECK-NEXT:    ldrh w8, [sp, #12]
-; CHECK-NEXT:    strb w10, [x19, #2]
+; CHECK-NEXT:    strb w9, [x19, #2]
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    strh w8, [x19]
 ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
@@ -86,32 +86,32 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    add x0, sp, #16
 ; CHECK-NEXT:    bl def
-; CHECK-NEXT:    ldp q0, q1, [sp, #16]
-; CHECK-NEXT:    mov z2.b, z0.b[14]
-; CHECK-NEXT:    mov z3.b, z0.b[12]
+; CHECK-NEXT:    ldp q0, q3, [sp, #16]
+; CHECK-NEXT:    mov z1.b, z0.b[14]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    mov z2.b, z0.b[12]
 ; CHECK-NEXT:    mov z4.b, z0.b[10]
 ; CHECK-NEXT:    mov z5.b, z0.b[8]
-; CHECK-NEXT:    mov z6.b, z0.b[6]
 ; CHECK-NEXT:    strb w8, [sp]
-; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    strb w9, [sp, #7]
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    strb w10, [sp, #6]
-; CHECK-NEXT:    fmov w10, s6
-; CHECK-NEXT:    mov z7.b, z0.b[4]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.b, z0.b[6]
+; CHECK-NEXT:    strb w8, [sp, #7]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.b, z0.b[4]
 ; CHECK-NEXT:    mov z0.b, z0.b[2]
+; CHECK-NEXT:    strb w8, [sp, #6]
+; CHECK-NEXT:    fmov w8, s4
 ; CHECK-NEXT:    strb w8, [sp, #5]
-; CHECK-NEXT:    fmov w8, s7
-; CHECK-NEXT:    strb w9, [sp, #4]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strb w10, [sp, #3]
-; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    strb w8, [sp, #4]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strb w8, [sp, #3]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strb w8, [sp, #2]
-; CHECK-NEXT:    strb w9, [sp, #1]
-; CHECK-NEXT:    strb w10, [x19, #8]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [sp, #1]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strb w8, [x19, #8]
 ; CHECK-NEXT:    ldr q0, [sp]
 ; CHECK-NEXT:    fmov x8, d0
 ; CHECK-NEXT:    str x8, [x19]
@@ -137,8 +137,8 @@ define void @alloc_v8f64(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x20, sp
 ; CHECK-NEXT:    bl def
-; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    mov x8, #4 // =0x4
 ; CHECK-NEXT:    ld2d { z0.d, z1.d }, p0/z, [x20]
 ; CHECK-NEXT:    ld2d { z2.d, z3.d }, p0/z, [x20, x8, lsl #3]
 ; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
index a3591dfe527ee0..04235ef460a920 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
@@ -6,16 +6,16 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i32> @test(ptr %arg1, ptr %arg2) {
 ; CHECK-LABEL: test:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q2, q1, [x0, #32]
-; CHECK-NEXT:    add z2.s, z2.s, z2.s
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ldp q3, q4, [x0]
+; CHECK-NEXT:    add z2.s, z0.s, z0.s
+; CHECK-NEXT:    add z5.s, z1.s, z1.s
 ; CHECK-NEXT:    mov z0.s, z1.s[2]
-; CHECK-NEXT:    add z1.s, z1.s, z1.s
-; CHECK-NEXT:    stp q2, q1, [x0, #32]
+; CHECK-NEXT:    add z1.s, z3.s, z3.s
+; CHECK-NEXT:    add z3.s, z4.s, z4.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT:    add z2.s, z3.s, z3.s
-; CHECK-NEXT:    add z1.s, z4.s, z4.s
-; CHECK-NEXT:    stp q2, q1, [x0]
+; CHECK-NEXT:    stp q2, q5, [x0, #32]
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256
@@ -30,16 +30,16 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldp q1, q0, [x0, #32]
-; CHECK-NEXT:    add z1.s, z1.s, z1.s
-; CHECK-NEXT:    ldp q2, q3, [x0]
-; CHECK-NEXT:    add z4.s, z0.s, z0.s
+; CHECK-NEXT:    ldp q3, q4, [x0]
+; CHECK-NEXT:    add z2.s, z0.s, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
+; CHECK-NEXT:    add z1.s, z1.s, z1.s
+; CHECK-NEXT:    add z3.s, z3.s, z3.s
+; CHECK-NEXT:    add z4.s, z4.s, z4.s
 ; CHECK-NEXT:    mov z0.s, s0
-; CHECK-NEXT:    stp q1, q4, [x0, #32]
+; CHECK-NEXT:    stp q1, q2, [x0, #32]
+; CHECK-NEXT:    stp q3, q4, [x0]
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT:    add z1.s, z2.s, z2.s
-; CHECK-NEXT:    add z2.s, z3.s, z3.s
-; CHECK-NEXT:    stp q1, q2, [x0]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i32>, ptr %arg1, align 256

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
index c42be3a5354891..d4033aa85e5984 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
@@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define i8 @andv_v4i8(<4 x i8> %a) {
 ; CHECK-LABEL: andv_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -23,8 +23,8 @@ define i8 @andv_v4i8(<4 x i8> %a) {
 define i8 @andv_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: andv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -35,8 +35,8 @@ define i8 @andv_v8i8(<8 x i8> %a) {
 define i8 @andv_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: andv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -47,8 +47,8 @@ define i8 @andv_v16i8(<16 x i8> %a) {
 define i8 @andv_v32i8(ptr %a) {
 ; CHECK-LABEL: andv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    and z0.d, z1.d, z0.d
 ; CHECK-NEXT:    andv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
@@ -61,8 +61,8 @@ define i8 @andv_v32i8(ptr %a) {
 define i16 @andv_v2i16(<2 x i16> %a) {
 ; CHECK-LABEL: andv_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -73,8 +73,8 @@ define i16 @andv_v2i16(<2 x i16> %a) {
 define i16 @andv_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: andv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -85,8 +85,8 @@ define i16 @andv_v4i16(<4 x i16> %a) {
 define i16 @andv_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: andv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -97,8 +97,8 @@ define i16 @andv_v8i16(<8 x i16> %a) {
 define i16 @andv_v16i16(ptr %a) {
 ; CHECK-LABEL: andv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    and z0.d, z1.d, z0.d
 ; CHECK-NEXT:    andv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
@@ -111,8 +111,8 @@ define i16 @andv_v16i16(ptr %a) {
 define i32 @andv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: andv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -123,8 +123,8 @@ define i32 @andv_v2i32(<2 x i32> %a) {
 define i32 @andv_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: andv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -135,8 +135,8 @@ define i32 @andv_v4i32(<4 x i32> %a) {
 define i32 @andv_v8i32(ptr %a) {
 ; CHECK-LABEL: andv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    and z0.d, z1.d, z0.d
 ; CHECK-NEXT:    andv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
@@ -149,8 +149,8 @@ define i32 @andv_v8i32(ptr %a) {
 define i64 @andv_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: andv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    andv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -161,8 +161,8 @@ define i64 @andv_v2i64(<2 x i64> %a) {
 define i64 @andv_v4i64(ptr %a) {
 ; CHECK-LABEL: andv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    and z0.d, z1.d, z0.d
 ; CHECK-NEXT:    andv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
@@ -179,8 +179,8 @@ define i64 @andv_v4i64(ptr %a) {
 define i8 @eorv_v4i8(<4 x i8> %a) {
 ; CHECK-LABEL: eorv_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -191,8 +191,8 @@ define i8 @eorv_v4i8(<4 x i8> %a) {
 define i8 @eorv_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: eorv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -203,8 +203,8 @@ define i8 @eorv_v8i8(<8 x i8> %a) {
 define i8 @eorv_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: eorv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -215,8 +215,8 @@ define i8 @eorv_v16i8(<16 x i8> %a) {
 define i8 @eorv_v32i8(ptr %a) {
 ; CHECK-LABEL: eorv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    eor z0.d, z1.d, z0.d
 ; CHECK-NEXT:    eorv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
@@ -229,8 +229,8 @@ define i8 @eorv_v32i8(ptr %a) {
 define i16 @eorv_v2i16(<2 x i16> %a) {
 ; CHECK-LABEL: eorv_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -241,8 +241,8 @@ define i16 @eorv_v2i16(<2 x i16> %a) {
 define i16 @eorv_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: eorv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -253,8 +253,8 @@ define i16 @eorv_v4i16(<4 x i16> %a) {
 define i16 @eorv_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: eorv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -265,8 +265,8 @@ define i16 @eorv_v8i16(<8 x i16> %a) {
 define i16 @eorv_v16i16(ptr %a) {
 ; CHECK-LABEL: eorv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    eor z0.d, z1.d, z0.d
 ; CHECK-NEXT:    eorv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
@@ -279,8 +279,8 @@ define i16 @eorv_v16i16(ptr %a) {
 define i32 @eorv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: eorv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -291,8 +291,8 @@ define i32 @eorv_v2i32(<2 x i32> %a) {
 define i32 @eorv_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: eorv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -303,8 +303,8 @@ define i32 @eorv_v4i32(<4 x i32> %a) {
 define i32 @eorv_v8i32(ptr %a) {
 ; CHECK-LABEL: eorv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    eor z0.d, z1.d, z0.d
 ; CHECK-NEXT:    eorv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
@@ -317,8 +317,8 @@ define i32 @eorv_v8i32(ptr %a) {
 define i64 @eorv_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: eorv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    eorv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -329,8 +329,8 @@ define i64 @eorv_v2i64(<2 x i64> %a) {
 define i64 @eorv_v4i64(ptr %a) {
 ; CHECK-LABEL: eorv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    eor z0.d, z1.d, z0.d
 ; CHECK-NEXT:    eorv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
@@ -347,8 +347,8 @@ define i64 @eorv_v4i64(ptr %a) {
 define i8 @orv_v4i8(<4 x i8> %a) {
 ; CHECK-LABEL: orv_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -359,8 +359,8 @@ define i8 @orv_v4i8(<4 x i8> %a) {
 define i8 @orv_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: orv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -371,8 +371,8 @@ define i8 @orv_v8i8(<8 x i8> %a) {
 define i8 @orv_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: orv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -383,8 +383,8 @@ define i8 @orv_v16i8(<16 x i8> %a) {
 define i8 @orv_v32i8(ptr %a) {
 ; CHECK-LABEL: orv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    orr z0.d, z1.d, z0.d
 ; CHECK-NEXT:    orv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w0, s0
@@ -397,8 +397,8 @@ define i8 @orv_v32i8(ptr %a) {
 define i16 @orv_v2i16(<2 x i16> %a) {
 ; CHECK-LABEL: orv_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -409,8 +409,8 @@ define i16 @orv_v2i16(<2 x i16> %a) {
 define i16 @orv_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: orv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -421,8 +421,8 @@ define i16 @orv_v4i16(<4 x i16> %a) {
 define i16 @orv_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: orv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -433,8 +433,8 @@ define i16 @orv_v8i16(<8 x i16> %a) {
 define i16 @orv_v16i16(ptr %a) {
 ; CHECK-LABEL: orv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    orr z0.d, z1.d, z0.d
 ; CHECK-NEXT:    orv h0, p0, z0.h
 ; CHECK-NEXT:    fmov w0, s0
@@ -447,8 +447,8 @@ define i16 @orv_v16i16(ptr %a) {
 define i32 @orv_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: orv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -459,8 +459,8 @@ define i32 @orv_v2i32(<2 x i32> %a) {
 define i32 @orv_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: orv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
@@ -471,8 +471,8 @@ define i32 @orv_v4i32(<4 x i32> %a) {
 define i32 @orv_v8i32(ptr %a) {
 ; CHECK-LABEL: orv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    orr z0.d, z1.d, z0.d
 ; CHECK-NEXT:    orv s0, p0, z0.s
 ; CHECK-NEXT:    fmov w0, s0
@@ -485,8 +485,8 @@ define i32 @orv_v8i32(ptr %a) {
 define i64 @orv_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: orv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    orv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -497,8 +497,8 @@ define i64 @orv_v2i64(<2 x i64> %a) {
 define i64 @orv_v4i64(ptr %a) {
 ; CHECK-LABEL: orv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    orr z0.d, z1.d, z0.d
 ; CHECK-NEXT:    orv d0, p0, z0.d
 ; CHECK-NEXT:    fmov x0, d0

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index e746770e29a2f5..9785d795744ef8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -59,69 +59,69 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) {
 ; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    ldr w8, [sp, #224]
-; CHECK-NEXT:    strb w7, [sp, #6]
 ; CHECK-NEXT:    ldr w9, [sp, #216]
-; CHECK-NEXT:    strb w6, [sp, #5]
-; CHECK-NEXT:    ldr w10, [sp, #208]
-; CHECK-NEXT:    strb w5, [sp, #4]
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    strb w7, [sp, #6]
 ; CHECK-NEXT:    strb w8, [sp, #31]
-; CHECK-NEXT:    ldr w8, [sp, #200]
+; CHECK-NEXT:    ldr w8, [sp, #208]
 ; CHECK-NEXT:    strb w9, [sp, #30]
-; CHECK-NEXT:    ldr w9, [sp, #192]
-; CHECK-NEXT:    strb w10, [sp, #29]
-; CHECK-NEXT:    ldr w10, [sp, #184]
-; CHECK-NEXT:    strb w8, [sp, #28]
+; CHECK-NEXT:    ldr w9, [sp, #200]
+; CHECK-NEXT:    strb w8, [sp, #29]
+; CHECK-NEXT:    ldr w8, [sp, #192]
+; CHECK-NEXT:    strb w9, [sp, #28]
+; CHECK-NEXT:    ldr w9, [sp, #184]
+; CHECK-NEXT:    strb w8, [sp, #27]
 ; CHECK-NEXT:    ldr w8, [sp, #176]
-; CHECK-NEXT:    strb w9, [sp, #27]
+; CHECK-NEXT:    strb w9, [sp, #26]
 ; CHECK-NEXT:    ldr w9, [sp, #168]
-; CHECK-NEXT:    strb w10, [sp, #26]
-; CHECK-NEXT:    ldr w10, [sp, #160]
 ; CHECK-NEXT:    strb w8, [sp, #25]
-; CHECK-NEXT:    ldr w8, [sp, #152]
+; CHECK-NEXT:    ldr w8, [sp, #160]
 ; CHECK-NEXT:    strb w9, [sp, #24]
-; CHECK-NEXT:    ldr w9, [sp, #144]
-; CHECK-NEXT:    strb w10, [sp, #23]
-; CHECK-NEXT:    ldr w10, [sp, #136]
-; CHECK-NEXT:    strb w8, [sp, #22]
+; CHECK-NEXT:    ldr w9, [sp, #152]
+; CHECK-NEXT:    strb w8, [sp, #23]
+; CHECK-NEXT:    ldr w8, [sp, #144]
+; CHECK-NEXT:    strb w9, [sp, #22]
+; CHECK-NEXT:    ldr w9, [sp, #136]
+; CHECK-NEXT:    strb w8, [sp, #21]
 ; CHECK-NEXT:    ldr w8, [sp, #128]
-; CHECK-NEXT:    strb w9, [sp, #21]
+; CHECK-NEXT:    strb w9, [sp, #20]
 ; CHECK-NEXT:    ldr w9, [sp, #120]
-; CHECK-NEXT:    strb w10, [sp, #20]
-; CHECK-NEXT:    ldr w10, [sp, #112]
 ; CHECK-NEXT:    strb w8, [sp, #19]
-; CHECK-NEXT:    ldr w8, [sp, #104]
+; CHECK-NEXT:    ldr w8, [sp, #112]
 ; CHECK-NEXT:    strb w9, [sp, #18]
-; CHECK-NEXT:    ldr w9, [sp, #96]
-; CHECK-NEXT:    strb w10, [sp, #17]
-; CHECK-NEXT:    ldr w10, [sp, #88]
-; CHECK-NEXT:    strb w8, [sp, #16]
+; CHECK-NEXT:    ldr w9, [sp, #104]
+; CHECK-NEXT:    strb w8, [sp, #17]
+; CHECK-NEXT:    ldr w8, [sp, #96]
+; CHECK-NEXT:    strb w9, [sp, #16]
+; CHECK-NEXT:    ldr w9, [sp, #88]
+; CHECK-NEXT:    strb w8, [sp, #15]
 ; CHECK-NEXT:    ldr w8, [sp, #80]
-; CHECK-NEXT:    strb w9, [sp, #15]
+; CHECK-NEXT:    strb w9, [sp, #14]
 ; CHECK-NEXT:    ldr w9, [sp, #72]
-; CHECK-NEXT:    strb w10, [sp, #14]
-; CHECK-NEXT:    ldr w10, [sp, #64]
 ; CHECK-NEXT:    strb w8, [sp, #13]
-; CHECK-NEXT:    ldr w8, [sp, #56]
+; CHECK-NEXT:    ldr w8, [sp, #64]
 ; CHECK-NEXT:    strb w9, [sp, #12]
-; CHECK-NEXT:    ldr w9, [sp, #48]
-; CHECK-NEXT:    strb w10, [sp, #11]
-; CHECK-NEXT:    ldr w10, [sp, #40]
-; CHECK-NEXT:    strb w8, [sp, #10]
+; CHECK-NEXT:    ldr w9, [sp, #56]
+; CHECK-NEXT:    strb w8, [sp, #11]
+; CHECK-NEXT:    ldr w8, [sp, #48]
+; CHECK-NEXT:    strb w9, [sp, #10]
+; CHECK-NEXT:    ldr w9, [sp, #40]
+; CHECK-NEXT:    strb w8, [sp, #9]
 ; CHECK-NEXT:    ldr w8, [sp, #32]
-; CHECK-NEXT:    strb w9, [sp, #9]
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    strb w10, [sp, #8]
+; CHECK-NEXT:    strb w9, [sp, #8]
 ; CHECK-NEXT:    strb w8, [sp, #7]
 ; CHECK-NEXT:    mov w8, #16 // =0x10
+; CHECK-NEXT:    strb w6, [sp, #5]
+; CHECK-NEXT:    strb w5, [sp, #4]
 ; CHECK-NEXT:    strb w4, [sp, #3]
 ; CHECK-NEXT:    strb w3, [sp, #2]
 ; CHECK-NEXT:    strb w2, [sp, #1]
 ; CHECK-NEXT:    strb w1, [sp]
 ; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    lsl z1.b, z1.b, #7
-; CHECK-NEXT:    asr z1.b, z1.b, #7
 ; CHECK-NEXT:    lsl z0.b, z0.b, #7
+; CHECK-NEXT:    lsl z1.b, z1.b, #7
 ; CHECK-NEXT:    asr z0.b, z0.b, #7
+; CHECK-NEXT:    asr z1.b, z1.b, #7
 ; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
@@ -140,13 +140,13 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z1.s, z0.s[1]
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str wzr, [sp, #12]
-; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    strh w9, [sp, #10]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
@@ -195,19 +195,19 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    uunpklo z1.h, z0.b
+; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
-; CHECK-NEXT:    lsl z1.h, z1.h, #15
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    lsl z1.h, z1.h, #15
 ; CHECK-NEXT:    asr z1.h, z1.h, #15
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
-; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, #0
+; CHECK-NEXT:    asr z0.h, z0.h, #15
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
   %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer)
@@ -250,31 +250,31 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    mov z1.b, z0.b[3]
 ; CHECK-NEXT:    mov z2.b, z0.b[2]
-; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    mov z3.b, z0.b[1]
 ; CHECK-NEXT:    mov z4.b, z0.b[7]
-; CHECK-NEXT:    mov z5.b, z0.b[6]
-; CHECK-NEXT:    mov z6.b, z0.b[5]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    mov z0.b, z0.b[4]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    strh w8, [sp, #-16]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.b, z0.b[6]
+; CHECK-NEXT:    strh w8, [sp, #6]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.b, z0.b[5]
+; CHECK-NEXT:    mov z0.b, z0.b[4]
+; CHECK-NEXT:    strh w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp, #6]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w10, [sp, #4]
-; CHECK-NEXT:    fmov w10, s5
 ; CHECK-NEXT:    strh w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w10, [sp, #12]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w8, [sp, #8]
 ; CHECK-NEXT:    mov x8, #4 // =0x4
-; CHECK-NEXT:    strh w9, [sp, #8]
 ; CHECK-NEXT:    ldp d0, d1, [sp]
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
@@ -314,8 +314,8 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8
@@ -327,8 +327,8 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) {
 ; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z1
 ; CHECK-NEXT:    ret
   %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer)

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index f6aa59fd7c8043..edc21535b0fe41 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -59,74 +59,74 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    ldr w8, [sp, #96]
-; CHECK-NEXT:    strb w7, [sp, #6]
 ; CHECK-NEXT:    ldr w9, [sp, #88]
-; CHECK-NEXT:    strb w6, [sp, #5]
-; CHECK-NEXT:    ldr w10, [sp, #80]
-; CHECK-NEXT:    strb w5, [sp, #4]
+; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldr w10, [sp, #120]
+; CHECK-NEXT:    strb w7, [sp, #6]
 ; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    ldr w8, [sp, #72]
+; CHECK-NEXT:    ldr w8, [sp, #80]
 ; CHECK-NEXT:    strb w9, [sp, #14]
-; CHECK-NEXT:    ldr w9, [sp, #64]
-; CHECK-NEXT:    strb w10, [sp, #13]
-; CHECK-NEXT:    ldr w10, [sp, #56]
-; CHECK-NEXT:    strb w8, [sp, #12]
+; CHECK-NEXT:    ldr w9, [sp, #72]
+; CHECK-NEXT:    strb w8, [sp, #13]
+; CHECK-NEXT:    ldr w8, [sp, #64]
+; CHECK-NEXT:    strb w9, [sp, #12]
+; CHECK-NEXT:    ldr w9, [sp, #56]
+; CHECK-NEXT:    strb w8, [sp, #11]
 ; CHECK-NEXT:    ldr w8, [sp, #48]
-; CHECK-NEXT:    strb w9, [sp, #11]
+; CHECK-NEXT:    strb w9, [sp, #10]
 ; CHECK-NEXT:    ldr w9, [sp, #40]
-; CHECK-NEXT:    strb w10, [sp, #10]
-; CHECK-NEXT:    ldr w10, [sp, #32]
 ; CHECK-NEXT:    strb w8, [sp, #9]
-; CHECK-NEXT:    ldr w8, [sp, #224]
+; CHECK-NEXT:    ldr w8, [sp, #32]
 ; CHECK-NEXT:    strb w9, [sp, #8]
 ; CHECK-NEXT:    ldr w9, [sp, #216]
-; CHECK-NEXT:    strb w10, [sp, #7]
-; CHECK-NEXT:    ldr w10, [sp, #208]
-; CHECK-NEXT:    strb w8, [sp, #31]
-; CHECK-NEXT:    ldr w8, [sp, #200]
+; CHECK-NEXT:    strb w8, [sp, #7]
+; CHECK-NEXT:    ldr w8, [sp, #224]
 ; CHECK-NEXT:    strb w9, [sp, #30]
-; CHECK-NEXT:    ldr w9, [sp, #192]
-; CHECK-NEXT:    strb w10, [sp, #29]
-; CHECK-NEXT:    ldr w10, [sp, #184]
-; CHECK-NEXT:    strb w8, [sp, #28]
-; CHECK-NEXT:    ldr w8, [sp, #176]
-; CHECK-NEXT:    strb w9, [sp, #27]
+; CHECK-NEXT:    ldr w9, [sp, #200]
+; CHECK-NEXT:    strb w8, [sp, #31]
+; CHECK-NEXT:    ldr w8, [sp, #208]
+; CHECK-NEXT:    strb w9, [sp, #28]
+; CHECK-NEXT:    ldr w9, [sp, #184]
+; CHECK-NEXT:    strb w8, [sp, #29]
+; CHECK-NEXT:    ldr w8, [sp, #192]
+; CHECK-NEXT:    strb w9, [sp, #26]
 ; CHECK-NEXT:    ldr w9, [sp, #168]
-; CHECK-NEXT:    strb w10, [sp, #26]
-; CHECK-NEXT:    ldr w10, [sp, #160]
-; CHECK-NEXT:    strb w8, [sp, #25]
-; CHECK-NEXT:    ldr w8, [sp, #152]
+; CHECK-NEXT:    strb w8, [sp, #27]
+; CHECK-NEXT:    ldr w8, [sp, #176]
 ; CHECK-NEXT:    strb w9, [sp, #24]
-; CHECK-NEXT:    ldr w9, [sp, #144]
-; CHECK-NEXT:    strb w10, [sp, #23]
-; CHECK-NEXT:    ldr w10, [sp, #136]
-; CHECK-NEXT:    strb w8, [sp, #22]
+; CHECK-NEXT:    ldr w9, [sp, #152]
+; CHECK-NEXT:    strb w8, [sp, #25]
+; CHECK-NEXT:    ldr w8, [sp, #160]
+; CHECK-NEXT:    strb w9, [sp, #22]
+; CHECK-NEXT:    ldr w9, [sp, #136]
+; CHECK-NEXT:    strb w8, [sp, #23]
+; CHECK-NEXT:    ldr w8, [sp, #144]
+; CHECK-NEXT:    strb w9, [sp, #20]
+; CHECK-NEXT:    ldr w9, [sp, #112]
+; CHECK-NEXT:    strb w8, [sp, #21]
 ; CHECK-NEXT:    ldr w8, [sp, #128]
-; CHECK-NEXT:    strb w9, [sp, #21]
-; CHECK-NEXT:    ldr w9, [sp, #120]
-; CHECK-NEXT:    strb w10, [sp, #20]
-; CHECK-NEXT:    ldr w10, [sp, #112]
+; CHECK-NEXT:    strb w6, [sp, #5]
 ; CHECK-NEXT:    strb w8, [sp, #19]
 ; CHECK-NEXT:    ldr w8, [sp, #104]
+; CHECK-NEXT:    strb w5, [sp, #4]
 ; CHECK-NEXT:    strb w4, [sp, #3]
-; CHECK-NEXT:    ptrue p0.b, vl16
 ; CHECK-NEXT:    strb w3, [sp, #2]
 ; CHECK-NEXT:    strb w2, [sp, #1]
 ; CHECK-NEXT:    strb w1, [sp]
-; CHECK-NEXT:    strb w9, [sp, #18]
-; CHECK-NEXT:    strb w10, [sp, #17]
+; CHECK-NEXT:    strb w10, [sp, #18]
+; CHECK-NEXT:    strb w9, [sp, #17]
 ; CHECK-NEXT:    strb w8, [sp, #16]
 ; CHECK-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEXT:    ldp q0, q1, [sp]
+; CHECK-NEXT:    ldp q1, q0, [sp]
 ; CHECK-NEXT:    lsl z0.b, z0.b, #7
-; CHECK-NEXT:    asr z0.b, z0.b, #7
 ; CHECK-NEXT:    lsl z1.b, z1.b, #7
+; CHECK-NEXT:    asr z0.b, z0.b, #7
+; CHECK-NEXT:    asr z1.b, z1.b, #7
 ; CHECK-NEXT:    cmpne p1.b, p0/z, z0.b, #0
-; CHECK-NEXT:    asr z0.b, z1.b, #7
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT:    mov z0.b, #0 // =0x0
-; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
-; CHECK-NEXT:    st1b { z0.b }, p1, [x0]
+; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
+; CHECK-NEXT:    st1b { z0.b }, p1, [x0, x8]
+; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask)
@@ -139,13 +139,13 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z1.s, z0.s[1]
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    str wzr, [sp, #12]
-; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    strh w9, [sp, #10]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
@@ -194,15 +194,15 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    ptrue p0.h, vl8
 ; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
 ; CHECK-NEXT:    uunpklo z0.h, z0.b
 ; CHECK-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEXT:    lsl z0.h, z0.h, #15
 ; CHECK-NEXT:    lsl z1.h, z1.h, #15
-; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    asr z1.h, z1.h, #15
 ; CHECK-NEXT:    asr z0.h, z0.h, #15
+; CHECK-NEXT:    asr z1.h, z1.h, #15
 ; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, #0
 ; CHECK-NEXT:    mov z1.h, #0 // =0x0
 ; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
@@ -237,42 +237,42 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.b, z0.b[7]
 ; CHECK-NEXT:    mov z2.b, z0.b[6]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    mov z1.b, z0.b[5]
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    mov z2.b, z0.b[4]
-; CHECK-NEXT:    fmov w11, s1
+; CHECK-NEXT:    mov z3.b, z0.b[5]
+; CHECK-NEXT:    mov z4.b, z0.b[4]
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.s, #0 // =0x0
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s3
 ; CHECK-NEXT:    mov z3.b, z0.b[2]
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w10, [sp, #12]
-; CHECK-NEXT:    mov z2.b, z0.b[3]
-; CHECK-NEXT:    strh w11, [sp, #10]
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s4
 ; CHECK-NEXT:    mov z4.b, z0.b[1]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    strh w8, [sp, #8]
 ; CHECK-NEXT:    mov x8, #4 // =0x4
-; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    uunpklo z0.s, z1.h
-; CHECK-NEXT:    lsl z0.s, z0.s, #31
-; CHECK-NEXT:    asr z0.s, z0.s, #31
-; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, #0
-; CHECK-NEXT:    mov z0.s, #0 // =0x0
-; CHECK-NEXT:    st1w { z0.s }, p1, [x0, x8, lsl #2]
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w10, [sp, #6]
-; CHECK-NEXT:    strh w8, [sp, #4]
-; CHECK-NEXT:    strh w9, [sp, #2]
-; CHECK-NEXT:    ldr d1, [sp]
+; CHECK-NEXT:    ldr d1, [sp, #8]
 ; CHECK-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEXT:    lsl z1.s, z1.s, #31
 ; CHECK-NEXT:    asr z1.s, z1.s, #31
-; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
+; CHECK-NEXT:    mov z1.b, z0.b[3]
+; CHECK-NEXT:    st1w { z2.s }, p1, [x0, x8, lsl #2]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #6]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w8, [sp, #4]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    ldr d0, [sp]
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    lsl z0.s, z0.s, #31
+; CHECK-NEXT:    asr z0.s, z0.s, #31
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    st1w { z2.s }, p0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
@@ -299,8 +299,8 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    mov x8, #2 // =0x2
 ; CHECK-NEXT:    uunpklo z0.s, z0.h
 ; CHECK-NEXT:    uunpklo z1.d, z0.s
 ; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #8

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index c8cbc6b14d6315..49ebff3791b887 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -53,10 +53,10 @@ define void @add_v16i8(ptr %a, ptr %b) {
 define void @add_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: add_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    add z0.b, z0.b, z2.b
-; CHECK-NEXT:    add z1.b, z1.b, z3.b
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    add z0.b, z1.b, z0.b
+; CHECK-NEXT:    add z1.b, z2.b, z3.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
@@ -125,10 +125,10 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) {
 define void @add_v16i16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: add_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEXT:    add z1.h, z1.h, z3.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-NEXT:    add z1.h, z2.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
@@ -141,8 +141,8 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) {
 define void @abs_v2i32(ptr %a) {
 ; CHECK-LABEL: abs_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
@@ -155,8 +155,8 @@ define void @abs_v2i32(ptr %a) {
 define void @abs_v4i32(ptr %a) {
 ; CHECK-LABEL: abs_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
@@ -169,8 +169,8 @@ define void @abs_v4i32(ptr %a) {
 define void @abs_v8i32(ptr %a) {
 ; CHECK-LABEL: abs_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    abs z0.s, p0/m, z0.s
 ; CHECK-NEXT:    abs z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -184,8 +184,8 @@ define void @abs_v8i32(ptr %a) {
 define void @abs_v2i64(ptr %a) {
 ; CHECK-LABEL: abs_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
@@ -198,8 +198,8 @@ define void @abs_v2i64(ptr %a) {
 define void @abs_v4i64(ptr %a) {
 ; CHECK-LABEL: abs_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    abs z0.d, p0/m, z0.d
 ; CHECK-NEXT:    abs z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -213,8 +213,8 @@ define void @abs_v4i64(ptr %a) {
 define void @fadd_v2f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v2f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ldr s1, [x1]
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    fmov w8, s0
@@ -230,8 +230,8 @@ define void @fadd_v2f16(ptr %a, ptr %b) {
 define void @fadd_v4f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    str d0, [x0]
@@ -246,8 +246,8 @@ define void @fadd_v4f16(ptr %a, ptr %b) {
 define void @fadd_v8f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
@@ -262,10 +262,11 @@ define void @fadd_v8f16(ptr %a, ptr %b) {
 define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -279,8 +280,8 @@ define void @fadd_v16f16(ptr %a, ptr %b) {
 define void @fadd_v2f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ldr d1, [x1]
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    str d0, [x0]
@@ -295,8 +296,8 @@ define void @fadd_v2f32(ptr %a, ptr %b) {
 define void @fadd_v4f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    str q0, [x0]
@@ -311,10 +312,11 @@ define void @fadd_v4f32(ptr %a, ptr %b) {
 define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -328,8 +330,8 @@ define void @fadd_v8f32(ptr %a, ptr %b) {
 define void @fadd_v2f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    str q0, [x0]
@@ -344,10 +346,11 @@ define void @fadd_v2f64(ptr %a, ptr %b) {
 define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ldp q0, q3, [x1]
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    movprfx z1, z2
 ; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index c4c87debac0870..8ba3bf8bd849dc 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -8,8 +8,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @test_revbv16i16(ptr %a) {
 ; CHECK-LABEL: test_revbv16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    revb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -24,8 +24,8 @@ define void @test_revbv16i16(ptr %a) {
 define void @test_revbv8i32(ptr %a) {
 ; CHECK-LABEL: test_revbv8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    revb z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -40,8 +40,8 @@ define void @test_revbv8i32(ptr %a) {
 define void @test_revbv4i64(ptr %a) {
 ; CHECK-LABEL: test_revbv4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    revb z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -56,8 +56,8 @@ define void @test_revbv4i64(ptr %a) {
 define void @test_revhv8i32(ptr %a) {
 ; CHECK-LABEL: test_revhv8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revh z0.s, p0/m, z0.s
 ; CHECK-NEXT:    revh z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -72,8 +72,8 @@ define void @test_revhv8i32(ptr %a) {
 define void @test_revhv8f32(ptr %a) {
 ; CHECK-LABEL: test_revhv8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revh z0.s, p0/m, z0.s
 ; CHECK-NEXT:    revh z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -88,8 +88,8 @@ define void @test_revhv8f32(ptr %a) {
 define void @test_revhv4i64(ptr %a) {
 ; CHECK-LABEL: test_revhv4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revh z0.d, p0/m, z0.d
 ; CHECK-NEXT:    revh z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -104,8 +104,8 @@ define void @test_revhv4i64(ptr %a) {
 define void @test_revwv4i64(ptr %a) {
 ; CHECK-LABEL: test_revwv4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revw z0.d, p0/m, z0.d
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -120,8 +120,8 @@ define void @test_revwv4i64(ptr %a) {
 define void @test_revwv4f64(ptr %a) {
 ; CHECK-LABEL: test_revwv4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revw z0.d, p0/m, z0.d
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -135,8 +135,8 @@ define void @test_revwv4f64(ptr %a) {
 define <16 x i8> @test_revv16i8(ptr %a) {
 ; CHECK-LABEL: test_revv16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -149,8 +149,8 @@ define <16 x i8> @test_revv16i8(ptr %a) {
 define void @test_revwv8i32v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: test_revwv8i32v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x1]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldp q0, q1, [x1]
 ; CHECK-NEXT:    revw z0.d, p0/m, z0.d
 ; CHECK-NEXT:    revw z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -165,15 +165,15 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) {
 define void @test_revhv32i16(ptr %a) {
 ; CHECK-LABEL: test_revhv32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    revh z0.d, p0/m, z0.d
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    revh z0.d, p0/m, z0.d
 ; CHECK-NEXT:    revh z1.d, p0/m, z1.d
+; CHECK-NEXT:    revh z2.d, p0/m, z2.d
+; CHECK-NEXT:    revh z3.d, p0/m, z3.d
 ; CHECK-NEXT:    stp q0, q1, [x0, #32]
-; CHECK-NEXT:    revh z0.d, p0/m, z2.d
-; CHECK-NEXT:    revh z1.d, p0/m, z3.d
-; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
   %tmp1 = load <32 x i16>, ptr %a
   %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20, i32 27, i32 undef, i32 undef, i32 undef, i32 31, i32 30, i32 29, i32 undef>
@@ -185,15 +185,15 @@ define void @test_rev_elts_fail(ptr %a) {
 ; CHECK-LABEL: test_rev_elts_fail:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    fmov x10, d1
 ; CHECK-NEXT:    mov z2.d, z0.d[1]
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov x9, d2
 ; CHECK-NEXT:    mov z0.d, z1.d[1]
-; CHECK-NEXT:    fmov x11, d0
+; CHECK-NEXT:    fmov x9, d2
 ; CHECK-NEXT:    stp x9, x8, [sp, #-32]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    stp x11, x10, [sp, #16]
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    stp x9, x8, [sp, #16]
 ; CHECK-NEXT:    ldp q1, q0, [sp]
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
@@ -209,8 +209,8 @@ define void @test_rev_elts_fail(ptr %a) {
 define void @test_revdv4i64_sve2p1(ptr %a) #1 {
 ; CHECK-LABEL: test_revdv4i64_sve2p1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revd z0.q, p0/m, z0.q
 ; CHECK-NEXT:    revd z1.q, p0/m, z1.q
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -224,8 +224,8 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 {
 define void @test_revdv4f64_sve2p1(ptr %a) #1 {
 ; CHECK-LABEL: test_revdv4f64_sve2p1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revd z0.q, p0/m, z0.q
 ; CHECK-NEXT:    revd z1.q, p0/m, z1.q
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -241,25 +241,25 @@ define void @test_revv8i32(ptr %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov z2.s, z0.s[1]
-; CHECK-NEXT:    mov z3.s, z0.s[2]
+; CHECK-NEXT:    ldp q0, q3, [x0]
+; CHECK-NEXT:    mov z1.s, z0.s[1]
+; CHECK-NEXT:    mov z2.s, z0.s[2]
 ; CHECK-NEXT:    mov z4.s, z0.s[3]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    fmov w10, s3
-; CHECK-NEXT:    fmov w11, s4
-; CHECK-NEXT:    mov z0.s, z1.s[1]
-; CHECK-NEXT:    mov z2.s, z1.s[2]
-; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    mov z0.s, z3.s[1]
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z1.s, z3.s[2]
 ; CHECK-NEXT:    stp w9, w8, [sp, #24]
-; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s4
+; CHECK-NEXT:    mov z2.s, z3.s[3]
+; CHECK-NEXT:    stp w9, w8, [sp, #16]
+; CHECK-NEXT:    fmov w8, s3
 ; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    stp w11, w10, [sp, #16]
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    fmov w11, s3
 ; CHECK-NEXT:    stp w9, w8, [sp, #8]
-; CHECK-NEXT:    stp w11, w10, [sp]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    stp w9, w8, [sp]
 ; CHECK-NEXT:    ldp q0, q1, [sp]
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #32

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index 80edfc5ada0103..28d299c65fdc04 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -14,57 +14,57 @@ define void @zip1_v32i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    mov z2.b, z0.b[15]
+; CHECK-NEXT:    mov z3.b, z0.b[14]
+; CHECK-NEXT:    mov z4.b, z0.b[13]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z0.b[14]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[13]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.b, z0.b[12]
 ; CHECK-NEXT:    strb w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z0.b[11]
+; CHECK-NEXT:    strb w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z0.b[10]
+; CHECK-NEXT:    strb w8, [sp, #10]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z0.b[11]
-; CHECK-NEXT:    strb w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[10]
-; CHECK-NEXT:    strb w10, [sp, #10]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.b, z0.b[9]
 ; CHECK-NEXT:    strb w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z0.b[8]
-; CHECK-NEXT:    strb w9, [sp, #6]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z1.b[15]
-; CHECK-NEXT:    strb w10, [sp, #4]
-; CHECK-NEXT:    strb w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z0.b[8]
+; CHECK-NEXT:    zip1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    strb w8, [sp, #6]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z1.b[15]
+; CHECK-NEXT:    strb w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    mov z2.b, z1.b[14]
-; CHECK-NEXT:    strb w9, [sp]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z1.b[13]
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    mov z2.b, z1.b[12]
+; CHECK-NEXT:    strb w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z1.b[13]
+; CHECK-NEXT:    strb w8, [sp]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z1.b[12]
 ; CHECK-NEXT:    strb w8, [sp, #15]
 ; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    mov z2.b, z1.b[11]
-; CHECK-NEXT:    strb w9, [sp, #13]
-; CHECK-NEXT:    strb w10, [sp, #11]
-; CHECK-NEXT:    zip1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    strb w8, [sp, #13]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z1.b[10]
+; CHECK-NEXT:    strb w8, [sp, #11]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z1.b[9]
 ; CHECK-NEXT:    strb w8, [sp, #9]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[10]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z1.b[9]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.b, z1.b[8]
 ; CHECK-NEXT:    strb w8, [sp, #7]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strb w8, [sp, #5]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strb w8, [sp, #3]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w9, [sp, #5]
-; CHECK-NEXT:    strb w10, [sp, #3]
 ; CHECK-NEXT:    strb w8, [sp, #1]
-; CHECK-NEXT:    ldr q2, [sp]
+; CHECK-NEXT:    ldr q1, [sp]
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    str q2, [x0, #16]
+; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
@@ -79,119 +79,119 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #64
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    ldp q2, q5, [x1]
-; CHECK-NEXT:    ldp q4, q7, [x0]
-; CHECK-NEXT:    mov z16.h, z5.h[7]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    ldp q0, q4, [x0]
+; CHECK-NEXT:    ldp q2, q5, [x0, #32]
+; CHECK-NEXT:    mov z16.h, z3.h[7]
+; CHECK-NEXT:    mov z18.h, z3.h[6]
+; CHECK-NEXT:    mov z17.h, z4.h[7]
+; CHECK-NEXT:    ldp q6, q7, [x1, #32]
+; CHECK-NEXT:    mov z19.h, z4.h[6]
 ; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z5.h[6]
-; CHECK-NEXT:    fmov w10, s16
-; CHECK-NEXT:    mov z16.h, z5.h[5]
-; CHECK-NEXT:    mov z17.h, z7.h[7]
-; CHECK-NEXT:    fmov w9, s17
-; CHECK-NEXT:    mov z17.h, z7.h[6]
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
-; CHECK-NEXT:    ldp q3, q6, [x1, #32]
+; CHECK-NEXT:    mov z16.h, z3.h[5]
 ; CHECK-NEXT:    strh w8, [sp, #30]
 ; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    strh w9, [sp, #28]
-; CHECK-NEXT:    strh w10, [sp, #26]
+; CHECK-NEXT:    mov z17.h, z4.h[5]
+; CHECK-NEXT:    strh w8, [sp, #28]
+; CHECK-NEXT:    fmov w8, s18
+; CHECK-NEXT:    mov z18.h, z3.h[4]
+; CHECK-NEXT:    zip1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    strh w8, [sp, #26]
+; CHECK-NEXT:    fmov w8, s19
+; CHECK-NEXT:    mov z19.h, z7.h[6]
 ; CHECK-NEXT:    strh w8, [sp, #24]
 ; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z7.h[5]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    mov z16.h, z5.h[4]
-; CHECK-NEXT:    fmov w10, s16
-; CHECK-NEXT:    mov z16.h, z7.h[4]
+; CHECK-NEXT:    mov z16.h, z4.h[4]
+; CHECK-NEXT:    zip1 z4.h, z5.h, z7.h
 ; CHECK-NEXT:    strh w8, [sp, #22]
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    mov z17.h, z1.h[7]
+; CHECK-NEXT:    add z3.h, z3.h, z4.h
+; CHECK-NEXT:    strh w8, [sp, #20]
+; CHECK-NEXT:    fmov w8, s18
+; CHECK-NEXT:    mov z18.h, z0.h[7]
+; CHECK-NEXT:    strh w8, [sp, #18]
 ; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z2.h[7]
-; CHECK-NEXT:    strh w9, [sp, #20]
-; CHECK-NEXT:    strh w10, [sp, #18]
-; CHECK-NEXT:    mov z18.h, z6.h[7]
+; CHECK-NEXT:    mov z16.h, z1.h[6]
 ; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z4.h[7]
-; CHECK-NEXT:    ldr q17, [sp, #16]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    mov z16.h, z2.h[6]
-; CHECK-NEXT:    fmov w10, s16
-; CHECK-NEXT:    mov z16.h, z4.h[6]
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    mov z17.h, z0.h[6]
 ; CHECK-NEXT:    strh w8, [sp, #62]
+; CHECK-NEXT:    fmov w8, s18
+; CHECK-NEXT:    mov z18.h, z1.h[5]
+; CHECK-NEXT:    strh w8, [sp, #60]
 ; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z2.h[5]
-; CHECK-NEXT:    strh w9, [sp, #60]
-; CHECK-NEXT:    strh w10, [sp, #58]
-; CHECK-NEXT:    zip1 z5.h, z7.h, z5.h
+; CHECK-NEXT:    mov z16.h, z0.h[5]
+; CHECK-NEXT:    strh w8, [sp, #58]
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    mov z17.h, z1.h[4]
 ; CHECK-NEXT:    strh w8, [sp, #56]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z16.h, z4.h[5]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    mov z16.h, z2.h[4]
-; CHECK-NEXT:    fmov w10, s16
-; CHECK-NEXT:    mov z16.h, z4.h[4]
+; CHECK-NEXT:    fmov w8, s18
+; CHECK-NEXT:    mov z18.h, z0.h[4]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    zip1 z1.h, z2.h, z6.h
 ; CHECK-NEXT:    strh w8, [sp, #54]
 ; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    strh w9, [sp, #52]
-; CHECK-NEXT:    zip1 z2.h, z4.h, z2.h
-; CHECK-NEXT:    strh w10, [sp, #50]
-; CHECK-NEXT:    strh w8, [sp, #48]
+; CHECK-NEXT:    mov z16.h, z7.h[7]
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    strh w8, [sp, #52]
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    mov z17.h, z5.h[7]
+; CHECK-NEXT:    strh w8, [sp, #50]
 ; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z1.h[7]
-; CHECK-NEXT:    ldr q16, [sp, #48]
-; CHECK-NEXT:    fmov w9, s18
-; CHECK-NEXT:    mov z18.h, z6.h[6]
-; CHECK-NEXT:    fmov w10, s18
-; CHECK-NEXT:    mov z18.h, z1.h[6]
+; CHECK-NEXT:    ldr q18, [sp, #16]
+; CHECK-NEXT:    strh w8, [sp, #48]
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    mov z16.h, z5.h[6]
+; CHECK-NEXT:    ldr q20, [sp, #48]
 ; CHECK-NEXT:    strh w8, [sp, #46]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z6.h[5]
-; CHECK-NEXT:    strh w9, [sp, #44]
-; CHECK-NEXT:    strh w10, [sp, #42]
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    mov z17.h, z7.h[5]
+; CHECK-NEXT:    strh w8, [sp, #44]
+; CHECK-NEXT:    fmov w8, s19
+; CHECK-NEXT:    mov z19.h, z5.h[5]
+; CHECK-NEXT:    strh w8, [sp, #42]
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    mov z16.h, z7.h[4]
 ; CHECK-NEXT:    strh w8, [sp, #40]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z1.h[5]
-; CHECK-NEXT:    fmov w9, s18
-; CHECK-NEXT:    mov z18.h, z6.h[4]
-; CHECK-NEXT:    fmov w10, s18
-; CHECK-NEXT:    mov z18.h, z1.h[4]
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    mov z17.h, z5.h[4]
 ; CHECK-NEXT:    strh w8, [sp, #38]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z3.h[7]
-; CHECK-NEXT:    strh w9, [sp, #36]
-; CHECK-NEXT:    strh w10, [sp, #34]
-; CHECK-NEXT:    zip1 z1.h, z1.h, z6.h
+; CHECK-NEXT:    fmov w8, s19
+; CHECK-NEXT:    mov z19.h, z6.h[7]
+; CHECK-NEXT:    strh w8, [sp, #36]
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    mov z16.h, z2.h[7]
+; CHECK-NEXT:    strh w8, [sp, #34]
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    mov z17.h, z6.h[6]
 ; CHECK-NEXT:    strh w8, [sp, #32]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z0.h[7]
-; CHECK-NEXT:    ldr q4, [sp, #32]
-; CHECK-NEXT:    fmov w9, s18
-; CHECK-NEXT:    mov z18.h, z3.h[6]
-; CHECK-NEXT:    fmov w10, s18
-; CHECK-NEXT:    mov z18.h, z0.h[6]
+; CHECK-NEXT:    fmov w8, s19
+; CHECK-NEXT:    mov z19.h, z2.h[6]
 ; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z3.h[5]
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    strh w10, [sp, #10]
-; CHECK-NEXT:    add z1.h, z5.h, z1.h
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    mov z16.h, z6.h[5]
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    mov z17.h, z2.h[5]
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s19
+; CHECK-NEXT:    mov z19.h, z6.h[4]
 ; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z18.h, z0.h[5]
-; CHECK-NEXT:    add z4.h, z17.h, z4.h
-; CHECK-NEXT:    fmov w9, s18
-; CHECK-NEXT:    mov z18.h, z3.h[4]
-; CHECK-NEXT:    fmov w10, s18
-; CHECK-NEXT:    mov z18.h, z0.h[4]
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    mov z16.h, z2.h[4]
+; CHECK-NEXT:    ldr q2, [sp, #32]
 ; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    strh w9, [sp, #4]
-; CHECK-NEXT:    zip1 z0.h, z0.h, z3.h
-; CHECK-NEXT:    strh w10, [sp, #2]
-; CHECK-NEXT:    add z0.h, z2.h, z0.h
+; CHECK-NEXT:    fmov w8, s17
+; CHECK-NEXT:    add z2.h, z18.h, z2.h
+; CHECK-NEXT:    strh w8, [sp, #4]
+; CHECK-NEXT:    fmov w8, s19
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s16
 ; CHECK-NEXT:    strh w8, [sp]
-; CHECK-NEXT:    ldr q3, [sp]
-; CHECK-NEXT:    stp q1, q4, [x0, #32]
-; CHECK-NEXT:    add z1.h, z16.h, z3.h
+; CHECK-NEXT:    ldr q4, [sp]
+; CHECK-NEXT:    stp q3, q2, [x0, #32]
+; CHECK-NEXT:    add z1.h, z20.h, z4.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
@@ -214,33 +214,33 @@ define void @zip1_v16i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    mov z2.h, z0.h[7]
+; CHECK-NEXT:    mov z3.h, z0.h[6]
+; CHECK-NEXT:    mov z4.h, z0.h[5]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[6]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z0.h[5]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.h, z0.h[4]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.h, z1.h[7]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z1.h[6]
+; CHECK-NEXT:    strh w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z1.h[7]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z1.h[6]
-; CHECK-NEXT:    strh w10, [sp, #4]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.h, z1.h[5]
 ; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.h, z1.h[4]
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z1.h[4]
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w10, [sp, #10]
-; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    strh w9, [sp, #2]
-; CHECK-NEXT:    ldr q2, [sp]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    ldr q1, [sp]
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    str q2, [x0, #16]
+; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
@@ -260,19 +260,19 @@ define void @zip1_v8i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    mov z2.s, z0.s[3]
+; CHECK-NEXT:    mov z4.s, z0.s[2]
+; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z1.s[3]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.s, z0.s[2]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.s, z1.s[2]
-; CHECK-NEXT:    fmov w11, s2
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    fmov w9, s3
 ; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    stp w10, w11, [sp]
-; CHECK-NEXT:    ldr q2, [sp]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    stp w8, w9, [sp]
+; CHECK-NEXT:    ldr q1, [sp]
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    str q2, [x0, #16]
+; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
@@ -287,12 +287,13 @@ define void @zip_v4f64(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    zip1 z4.d, z1.d, z2.d
-; CHECK-NEXT:    trn2 z1.d, z1.d, z2.d
-; CHECK-NEXT:    zip1 z2.d, z0.d, z3.d
-; CHECK-NEXT:    trn2 z0.d, z0.d, z3.d
-; CHECK-NEXT:    fadd z2.d, p0/m, z2.d, z4.d
+; CHECK-NEXT:    ldp q3, q2, [x1]
+; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
+; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
+; CHECK-NEXT:    trn2 z1.d, z1.d, z3.d
+; CHECK-NEXT:    trn2 z0.d, z0.d, z2.d
+; CHECK-NEXT:    movprfx z2, z4
+; CHECK-NEXT:    fadd z2.d, p0/m, z2.d, z5.d
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret
@@ -314,17 +315,17 @@ define void @zip_v4i32(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr q1, [x0]
 ; CHECK-NEXT:    mov z2.s, z0.s[3]
 ; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    mov z4.s, z0.s[2]
+; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z0.s[2]
+; CHECK-NEXT:    mov z2.s, z1.s[2]
 ; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z3.s, z1.s[2]
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    fmov w11, s3
-; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
 ; CHECK-NEXT:    stp w9, w8, [sp, #8]
-; CHECK-NEXT:    stp w11, w10, [sp]
-; CHECK-NEXT:    ldr q2, [sp]
-; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    stp w9, w8, [sp]
+; CHECK-NEXT:    ldr q1, [sp]
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
@@ -345,10 +346,10 @@ define void @zip1_v8i32_undef(ptr %a) {
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    mov z1.s, z0.s[3]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z0.s[2]
-; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z2.s, z0.s[2]
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s2
 ; CHECK-NEXT:    stp w8, w8, [sp, #8]
 ; CHECK-NEXT:    stp w9, w9, [sp]
 ; CHECK-NEXT:    ldr q1, [sp]
@@ -365,15 +366,15 @@ define void @zip1_v8i32_undef(ptr %a) {
 define void @trn_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: trn_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    trn1 z4.b, z1.b, z2.b
-; CHECK-NEXT:    trn2 z1.b, z1.b, z2.b
-; CHECK-NEXT:    add z1.b, z4.b, z1.b
-; CHECK-NEXT:    trn1 z5.b, z0.b, z3.b
-; CHECK-NEXT:    trn2 z0.b, z0.b, z3.b
-; CHECK-NEXT:    add z0.b, z5.b, z0.b
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    trn1 z4.b, z0.b, z1.b
+; CHECK-NEXT:    trn2 z0.b, z0.b, z1.b
+; CHECK-NEXT:    trn1 z1.b, z2.b, z3.b
+; CHECK-NEXT:    trn2 z2.b, z2.b, z3.b
+; CHECK-NEXT:    add z0.b, z4.b, z0.b
+; CHECK-NEXT:    add z1.b, z1.b, z2.b
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
@@ -391,33 +392,33 @@ define void @trn_v8i16(ptr %a, ptr %b) {
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z1.h, z0.h[3]
 ; CHECK-NEXT:    mov z2.h, z0.h[1]
-; CHECK-NEXT:    mov z6.h, z0.h[2]
 ; CHECK-NEXT:    mov z3.h, z0.h[5]
 ; CHECK-NEXT:    mov z4.h, z0.h[4]
-; CHECK-NEXT:    mov z5.h, z0.h[6]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    mov z0.h, z0.h[7]
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    fmov w11, s6
 ; CHECK-NEXT:    strh w8, [sp, #-32]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w12, s4
-; CHECK-NEXT:    fmov w13, s5
-; CHECK-NEXT:    strh w11, [sp, #4]
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    strh w10, [sp, #12]
-; CHECK-NEXT:    strh w8, [sp, #10]
-; CHECK-NEXT:    strh w12, [sp, #8]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.h, z0.h[2]
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    mov z2.h, z0.h[6]
+; CHECK-NEXT:    mov z0.h, z0.h[7]
+; CHECK-NEXT:    fmov w10, s3
+; CHECK-NEXT:    fmov w11, s4
+; CHECK-NEXT:    fmov w12, s1
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w13, s2
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    strh w10, [sp, #10]
+; CHECK-NEXT:    strh w12, [sp, #4]
+; CHECK-NEXT:    fmov w12, s0
+; CHECK-NEXT:    strh w11, [sp, #8]
 ; CHECK-NEXT:    strh w13, [sp, #6]
-; CHECK-NEXT:    strh w11, [sp, #2]
-; CHECK-NEXT:    strh w11, [sp, #28]
-; CHECK-NEXT:    strh w12, [sp, #26]
-; CHECK-NEXT:    strh w8, [sp, #22]
-; CHECK-NEXT:    strh w9, [sp, #20]
+; CHECK-NEXT:    strh w12, [sp, #2]
+; CHECK-NEXT:    strh w12, [sp, #28]
+; CHECK-NEXT:    strh w11, [sp, #26]
+; CHECK-NEXT:    strh w10, [sp, #22]
+; CHECK-NEXT:    strh w8, [sp, #20]
 ; CHECK-NEXT:    strh w13, [sp, #18]
-; CHECK-NEXT:    strh w10, [sp, #16]
+; CHECK-NEXT:    strh w9, [sp, #16]
 ; CHECK-NEXT:    ldp q0, q1, [sp]
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str q0, [x0]
@@ -435,15 +436,15 @@ define void @trn_v8i16(ptr %a, ptr %b) {
 define void @trn_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: trn_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    trn1 z4.h, z1.h, z2.h
-; CHECK-NEXT:    trn2 z1.h, z1.h, z2.h
-; CHECK-NEXT:    add z1.h, z4.h, z1.h
-; CHECK-NEXT:    trn1 z5.h, z0.h, z3.h
-; CHECK-NEXT:    trn2 z0.h, z0.h, z3.h
-; CHECK-NEXT:    add z0.h, z5.h, z0.h
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    trn1 z4.h, z0.h, z1.h
+; CHECK-NEXT:    trn2 z0.h, z0.h, z1.h
+; CHECK-NEXT:    trn1 z1.h, z2.h, z3.h
+; CHECK-NEXT:    trn2 z2.h, z2.h, z3.h
+; CHECK-NEXT:    add z0.h, z4.h, z0.h
+; CHECK-NEXT:    add z1.h, z1.h, z2.h
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %tmp1 = load <16 x i16>, ptr %a
   %tmp2 = load <16 x i16>, ptr %b
@@ -457,15 +458,15 @@ define void @trn_v16i16(ptr %a, ptr %b) {
 define void @trn_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: trn_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    zip1 z4.s, z1.s, z2.s
-; CHECK-NEXT:    trn2 z1.s, z1.s, z2.s
-; CHECK-NEXT:    add z1.s, z4.s, z1.s
-; CHECK-NEXT:    trn1 z5.s, z0.s, z3.s
-; CHECK-NEXT:    trn2 z0.s, z0.s, z3.s
-; CHECK-NEXT:    add z0.s, z5.s, z0.s
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    zip1 z4.s, z0.s, z1.s
+; CHECK-NEXT:    trn2 z0.s, z0.s, z1.s
+; CHECK-NEXT:    trn1 z1.s, z2.s, z3.s
+; CHECK-NEXT:    trn2 z2.s, z2.s, z3.s
+; CHECK-NEXT:    add z0.s, z4.s, z0.s
+; CHECK-NEXT:    add z1.s, z1.s, z2.s
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x i32>, ptr %a
   %tmp2 = load <8 x i32>, ptr %b
@@ -479,16 +480,16 @@ define void @trn_v8i32(ptr %a, ptr %b) {
 define void @trn_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: trn_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    zip1 z4.d, z1.d, z2.d
-; CHECK-NEXT:    trn2 z1.d, z1.d, z2.d
-; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z4.d
-; CHECK-NEXT:    zip1 z5.d, z0.d, z3.d
-; CHECK-NEXT:    trn2 z0.d, z0.d, z3.d
-; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z5.d
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    ldp q1, q3, [x1]
+; CHECK-NEXT:    zip1 z4.d, z0.d, z1.d
+; CHECK-NEXT:    trn2 z0.d, z0.d, z1.d
+; CHECK-NEXT:    zip1 z1.d, z2.d, z3.d
+; CHECK-NEXT:    trn2 z2.d, z2.d, z3.d
+; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z4.d
+; CHECK-NEXT:    fadd z1.d, p0/m, z1.d, z2.d
+; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %a
   %tmp2 = load <4 x double>, ptr %b
@@ -502,8 +503,8 @@ define void @trn_v4f64(ptr %a, ptr %b) {
 define void @trn_v4f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: trn_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    trn1 z2.s, z0.s, z1.s
 ; CHECK-NEXT:    trn2 z0.s, z0.s, z1.s
@@ -525,9 +526,9 @@ define void @trn_v8i32_undef(ptr %a) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    trn1 z2.s, z0.s, z0.s
 ; CHECK-NEXT:    trn2 z0.s, z0.s, z0.s
-; CHECK-NEXT:    add z0.s, z2.s, z0.s
 ; CHECK-NEXT:    trn1 z3.s, z1.s, z1.s
 ; CHECK-NEXT:    trn2 z1.s, z1.s, z1.s
+; CHECK-NEXT:    add z0.s, z2.s, z0.s
 ; CHECK-NEXT:    add z1.s, z3.s, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
@@ -549,57 +550,57 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    mov z2.b, z0.b[15]
+; CHECK-NEXT:    mov z3.b, z0.b[14]
+; CHECK-NEXT:    mov z4.b, z0.b[13]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z0.b[14]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[13]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.b, z0.b[12]
 ; CHECK-NEXT:    strb w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z0.b[11]
+; CHECK-NEXT:    strb w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z0.b[10]
+; CHECK-NEXT:    strb w8, [sp, #10]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z0.b[11]
-; CHECK-NEXT:    strb w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z0.b[10]
-; CHECK-NEXT:    strb w10, [sp, #10]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.b, z0.b[9]
 ; CHECK-NEXT:    strb w8, [sp, #8]
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z0.b[8]
-; CHECK-NEXT:    strb w9, [sp, #6]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z1.b[15]
-; CHECK-NEXT:    strb w10, [sp, #4]
-; CHECK-NEXT:    strb w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z0.b[8]
+; CHECK-NEXT:    zip1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    strb w8, [sp, #6]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z1.b[15]
+; CHECK-NEXT:    strb w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    mov z2.b, z1.b[14]
-; CHECK-NEXT:    strb w9, [sp]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z1.b[13]
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    mov z2.b, z1.b[12]
+; CHECK-NEXT:    strb w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z1.b[13]
+; CHECK-NEXT:    strb w8, [sp]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z1.b[12]
 ; CHECK-NEXT:    strb w8, [sp, #15]
 ; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    mov z2.b, z1.b[11]
-; CHECK-NEXT:    strb w9, [sp, #13]
-; CHECK-NEXT:    strb w10, [sp, #11]
-; CHECK-NEXT:    zip1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    strb w8, [sp, #13]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z1.b[10]
+; CHECK-NEXT:    strb w8, [sp, #11]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z1.b[9]
 ; CHECK-NEXT:    strb w8, [sp, #9]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.b, z1.b[10]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.b, z1.b[9]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.b, z1.b[8]
 ; CHECK-NEXT:    strb w8, [sp, #7]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strb w8, [sp, #5]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strb w8, [sp, #3]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w9, [sp, #5]
-; CHECK-NEXT:    strb w10, [sp, #3]
 ; CHECK-NEXT:    strb w8, [sp, #1]
-; CHECK-NEXT:    ldr q2, [sp]
+; CHECK-NEXT:    ldr q1, [sp]
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    str q2, [x0, #16]
+; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   %tmp1 = load volatile <32 x i8>, ptr %a
@@ -619,33 +620,33 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    mov z2.h, z0.h[7]
+; CHECK-NEXT:    mov z3.h, z0.h[6]
+; CHECK-NEXT:    mov z4.h, z0.h[5]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z0.h[6]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z0.h[5]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.h, z0.h[4]
+; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.h, z1.h[7]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z1.h[6]
+; CHECK-NEXT:    strh w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z1.h[7]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.h, z1.h[6]
-; CHECK-NEXT:    strh w10, [sp, #4]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.h, z1.h[5]
 ; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.h, z1.h[4]
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.h, z1.h[4]
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    strh w10, [sp, #10]
-; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    strh w8, [sp, #6]
-; CHECK-NEXT:    strh w9, [sp, #2]
-; CHECK-NEXT:    ldr q2, [sp]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    ldr q1, [sp]
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    str q2, [x0, #16]
+; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   %tmp1 = load volatile <16 x i16>, ptr %a
@@ -665,19 +666,19 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    ldr q1, [x1]
 ; CHECK-NEXT:    ldr q1, [x1, #16]
 ; CHECK-NEXT:    mov z2.s, z0.s[3]
+; CHECK-NEXT:    mov z4.s, z0.s[2]
+; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z2.s, z1.s[3]
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    mov z2.s, z0.s[2]
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z2.s, z1.s[2]
-; CHECK-NEXT:    fmov w11, s2
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    fmov w9, s3
 ; CHECK-NEXT:    stp w8, w9, [sp, #8]
-; CHECK-NEXT:    stp w10, w11, [sp]
-; CHECK-NEXT:    ldr q2, [sp]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    stp w8, w9, [sp]
+; CHECK-NEXT:    ldr q1, [sp]
 ; CHECK-NEXT:    str q0, [x0]
-; CHECK-NEXT:    str q2, [x0, #16]
+; CHECK-NEXT:    str q1, [x0, #16]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   %tmp1 = load volatile <8 x i32>, ptr %a
@@ -695,10 +696,10 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
 ; CHECK-NEXT:    mov z1.s, z0.s[3]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z1.s, z0.s[2]
-; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    mov z2.s, z0.s[2]
 ; CHECK-NEXT:    zip1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s2
 ; CHECK-NEXT:    stp w8, w8, [sp, #8]
 ; CHECK-NEXT:    stp w9, w9, [sp]
 ; CHECK-NEXT:    ldr q1, [sp]
@@ -715,213 +716,197 @@ define void @zip2_v8i32_undef(ptr %a) #0{
 define void @uzp_v32i8(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #128
-; CHECK-NEXT:    stp d15, d14, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #112] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    .cfi_offset b8, -8
-; CHECK-NEXT:    .cfi_offset b9, -16
-; CHECK-NEXT:    .cfi_offset b10, -24
-; CHECK-NEXT:    .cfi_offset b11, -32
-; CHECK-NEXT:    .cfi_offset b12, -40
-; CHECK-NEXT:    .cfi_offset b13, -48
-; CHECK-NEXT:    .cfi_offset b14, -56
-; CHECK-NEXT:    .cfi_offset b15, -64
-; CHECK-NEXT:    ldp q0, q3, [x0]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z27.b, z0.b[14]
-; CHECK-NEXT:    mov z28.b, z0.b[12]
-; CHECK-NEXT:    mov z30.b, z0.b[8]
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q0, q1, [x1]
 ; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z2.b, z3.b[12]
-; CHECK-NEXT:    mov z4.b, z3.b[10]
-; CHECK-NEXT:    mov z1.b, z3.b[14]
-; CHECK-NEXT:    ldp q10, q11, [x1]
+; CHECK-NEXT:    mov z4.b, z3.b[14]
+; CHECK-NEXT:    mov z5.b, z3.b[12]
+; CHECK-NEXT:    mov z6.b, z3.b[10]
+; CHECK-NEXT:    mov z7.b, z3.b[8]
+; CHECK-NEXT:    mov z16.b, z3.b[11]
+; CHECK-NEXT:    mov z17.b, z3.b[9]
+; CHECK-NEXT:    mov z18.b, z3.b[7]
 ; CHECK-NEXT:    strb w8, [sp, #40]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w9, [sp, #32]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    mov z6.b, z3.b[6]
-; CHECK-NEXT:    mov z7.b, z3.b[4]
-; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    strb w8, [sp, #32]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z3.b[6]
+; CHECK-NEXT:    strb w8, [sp, #47]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.b, z3.b[4]
 ; CHECK-NEXT:    strb w8, [sp, #46]
 ; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    strb w9, [sp, #45]
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    mov z5.b, z3.b[8]
-; CHECK-NEXT:    strb w10, [sp, #47]
-; CHECK-NEXT:    fmov w10, s5
+; CHECK-NEXT:    mov z6.b, z3.b[2]
+; CHECK-NEXT:    strb w8, [sp, #45]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z2.b[14]
+; CHECK-NEXT:    strb w8, [sp, #44]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z2.b[12]
 ; CHECK-NEXT:    strb w8, [sp, #43]
-; CHECK-NEXT:    fmov w8, s27
-; CHECK-NEXT:    strb w9, [sp, #42]
-; CHECK-NEXT:    fmov w9, s28
-; CHECK-NEXT:    mov z16.b, z3.b[2]
-; CHECK-NEXT:    mov z31.b, z0.b[6]
-; CHECK-NEXT:    strb w10, [sp, #44]
-; CHECK-NEXT:    fmov w10, s16
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.b, z2.b[10]
+; CHECK-NEXT:    strb w8, [sp, #42]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z2.b[8]
+; CHECK-NEXT:    strb w8, [sp, #41]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z2.b[6]
 ; CHECK-NEXT:    strb w8, [sp, #39]
-; CHECK-NEXT:    fmov w8, s30
-; CHECK-NEXT:    strb w9, [sp, #38]
-; CHECK-NEXT:    fmov w9, s31
-; CHECK-NEXT:    mov z29.b, z0.b[10]
-; CHECK-NEXT:    mov z9.b, z0.b[2]
-; CHECK-NEXT:    strb w10, [sp, #41]
-; CHECK-NEXT:    fmov w10, s29
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z2.b[4]
+; CHECK-NEXT:    strb w8, [sp, #38]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.b, z2.b[2]
+; CHECK-NEXT:    strb w8, [sp, #37]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z1.b[10]
 ; CHECK-NEXT:    strb w8, [sp, #36]
-; CHECK-NEXT:    fmov w8, s9
-; CHECK-NEXT:    strb w9, [sp, #35]
-; CHECK-NEXT:    fmov w9, s11
-; CHECK-NEXT:    mov z8.b, z0.b[4]
-; CHECK-NEXT:    mov z16.b, z11.b[4]
-; CHECK-NEXT:    mov z27.b, z11.b[2]
-; CHECK-NEXT:    strb w10, [sp, #37]
-; CHECK-NEXT:    fmov w10, s8
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z1.b[8]
+; CHECK-NEXT:    strb w8, [sp, #35]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z1.b[14]
+; CHECK-NEXT:    strb w8, [sp, #34]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.b, z1.b[12]
 ; CHECK-NEXT:    strb w8, [sp, #33]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    strb w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s27
-; CHECK-NEXT:    mov z5.b, z11.b[10]
-; CHECK-NEXT:    mov z6.b, z11.b[8]
-; CHECK-NEXT:    mov z2.b, z11.b[14]
-; CHECK-NEXT:    fmov w12, s5
-; CHECK-NEXT:    fmov w13, s6
-; CHECK-NEXT:    mov z5.b, z10.b[10]
-; CHECK-NEXT:    mov z6.b, z10.b[8]
-; CHECK-NEXT:    strb w10, [sp, #34]
-; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strb w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [sp]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z1.b[6]
+; CHECK-NEXT:    strb w8, [sp, #15]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.b, z1.b[4]
+; CHECK-NEXT:    strb w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z1.b[2]
+; CHECK-NEXT:    strb w8, [sp, #13]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z0.b[14]
+; CHECK-NEXT:    strb w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z0.b[12]
+; CHECK-NEXT:    strb w8, [sp, #11]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.b, z0.b[10]
 ; CHECK-NEXT:    strb w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z0.b[8]
+; CHECK-NEXT:    strb w8, [sp, #9]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z0.b[6]
+; CHECK-NEXT:    strb w8, [sp, #7]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.b, z0.b[4]
+; CHECK-NEXT:    strb w8, [sp, #6]
 ; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strb w9, [sp, #9]
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    mov z4.b, z11.b[12]
-; CHECK-NEXT:    mov z7.b, z11.b[6]
-; CHECK-NEXT:    mov z28.b, z11.b[15]
-; CHECK-NEXT:    mov z29.b, z11.b[13]
-; CHECK-NEXT:    mov z30.b, z11.b[11]
-; CHECK-NEXT:    mov z31.b, z11.b[9]
-; CHECK-NEXT:    mov z8.b, z11.b[7]
-; CHECK-NEXT:    mov z9.b, z11.b[5]
-; CHECK-NEXT:    mov z12.b, z11.b[3]
-; CHECK-NEXT:    mov z13.b, z11.b[1]
-; CHECK-NEXT:    mov z2.b, z10.b[14]
-; CHECK-NEXT:    mov z11.b, z10.b[4]
-; CHECK-NEXT:    mov z14.b, z10.b[2]
-; CHECK-NEXT:    strb w10, [sp, #15]
-; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    mov z5.b, z0.b[2]
 ; CHECK-NEXT:    strb w8, [sp, #5]
-; CHECK-NEXT:    fmov w8, s11
-; CHECK-NEXT:    strb w9, [sp, #4]
-; CHECK-NEXT:    fmov w9, s14
-; CHECK-NEXT:    mov z17.b, z3.b[15]
-; CHECK-NEXT:    mov z18.b, z3.b[13]
-; CHECK-NEXT:    fmov w14, s7
-; CHECK-NEXT:    mov z7.b, z10.b[6]
-; CHECK-NEXT:    strb w10, [sp, #7]
-; CHECK-NEXT:    fmov w10, s7
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z3.b[15]
+; CHECK-NEXT:    strb w8, [sp, #4]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z3.b[13]
+; CHECK-NEXT:    strb w8, [sp, #3]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    ldr q4, [sp, #32]
 ; CHECK-NEXT:    strb w8, [sp, #2]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    strb w9, [sp, #1]
-; CHECK-NEXT:    fmov w9, s18
-; CHECK-NEXT:    mov z19.b, z3.b[11]
-; CHECK-NEXT:    mov z20.b, z3.b[9]
-; CHECK-NEXT:    mov z21.b, z3.b[7]
-; CHECK-NEXT:    strb w10, [sp, #3]
-; CHECK-NEXT:    fmov w10, s19
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    strb w8, [sp, #1]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z3.b[5]
+; CHECK-NEXT:    mov z3.b, z3.b[3]
+; CHECK-NEXT:    ldr q5, [sp]
 ; CHECK-NEXT:    strb w8, [sp, #63]
-; CHECK-NEXT:    fmov w8, s20
-; CHECK-NEXT:    strb w9, [sp, #62]
-; CHECK-NEXT:    fmov w9, s21
-; CHECK-NEXT:    mov z22.b, z3.b[5]
-; CHECK-NEXT:    mov z23.b, z3.b[3]
-; CHECK-NEXT:    mov z3.b, z0.b[13]
-; CHECK-NEXT:    strb w10, [sp, #61]
-; CHECK-NEXT:    fmov w10, s22
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z2.b[13]
+; CHECK-NEXT:    strb w8, [sp, #62]
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    mov z16.b, z2.b[11]
+; CHECK-NEXT:    strb w8, [sp, #61]
+; CHECK-NEXT:    fmov w8, s17
 ; CHECK-NEXT:    strb w8, [sp, #60]
-; CHECK-NEXT:    fmov w8, s23
-; CHECK-NEXT:    strb w9, [sp, #59]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    mov z24.b, z0.b[11]
-; CHECK-NEXT:    mov z25.b, z0.b[9]
-; CHECK-NEXT:    mov z26.b, z0.b[5]
-; CHECK-NEXT:    strb w10, [sp, #58]
-; CHECK-NEXT:    fmov w10, s24
+; CHECK-NEXT:    fmov w8, s18
+; CHECK-NEXT:    strb w8, [sp, #59]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z2.b[9]
+; CHECK-NEXT:    strb w8, [sp, #58]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z2.b[5]
 ; CHECK-NEXT:    strb w8, [sp, #57]
-; CHECK-NEXT:    fmov w8, s25
-; CHECK-NEXT:    strb w9, [sp, #54]
-; CHECK-NEXT:    fmov w9, s26
-; CHECK-NEXT:    mov z1.b, z0.b[3]
-; CHECK-NEXT:    mov z0.b, z0.b[1]
-; CHECK-NEXT:    strb w10, [sp, #53]
-; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z2.b[3]
+; CHECK-NEXT:    mov z2.b, z2.b[1]
+; CHECK-NEXT:    strb w8, [sp, #54]
+; CHECK-NEXT:    fmov w8, s16
+; CHECK-NEXT:    strb w8, [sp, #53]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z1.b[15]
 ; CHECK-NEXT:    strb w8, [sp, #52]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strb w9, [sp, #50]
-; CHECK-NEXT:    fmov w9, s28
-; CHECK-NEXT:    strb w10, [sp, #49]
-; CHECK-NEXT:    fmov w10, s29
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z1.b[13]
+; CHECK-NEXT:    strb w8, [sp, #50]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z1.b[11]
+; CHECK-NEXT:    strb w8, [sp, #49]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.b, z1.b[9]
 ; CHECK-NEXT:    strb w8, [sp, #48]
-; CHECK-NEXT:    fmov w8, s30
-; CHECK-NEXT:    strb w9, [sp, #31]
-; CHECK-NEXT:    fmov w9, s31
-; CHECK-NEXT:    strb w10, [sp, #30]
-; CHECK-NEXT:    fmov w10, s8
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z1.b[7]
+; CHECK-NEXT:    strb w8, [sp, #31]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z1.b[5]
+; CHECK-NEXT:    strb w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.b, z1.b[3]
+; CHECK-NEXT:    mov z1.b, z1.b[1]
 ; CHECK-NEXT:    strb w8, [sp, #29]
-; CHECK-NEXT:    fmov w8, s9
-; CHECK-NEXT:    strb w9, [sp, #28]
-; CHECK-NEXT:    fmov w9, s12
-; CHECK-NEXT:    fmov w11, s4
-; CHECK-NEXT:    mov z15.b, z10.b[15]
-; CHECK-NEXT:    mov z16.b, z10.b[13]
-; CHECK-NEXT:    strb w10, [sp, #27]
-; CHECK-NEXT:    fmov w10, s13
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.b, z0.b[15]
+; CHECK-NEXT:    strb w8, [sp, #28]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z0.b[11]
+; CHECK-NEXT:    strb w8, [sp, #27]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z0.b[13]
 ; CHECK-NEXT:    strb w8, [sp, #26]
-; CHECK-NEXT:    fmov w8, s15
-; CHECK-NEXT:    strb w9, [sp, #25]
-; CHECK-NEXT:    fmov w9, s16
-; CHECK-NEXT:    mov z4.b, z10.b[12]
-; CHECK-NEXT:    mov z27.b, z10.b[11]
-; CHECK-NEXT:    strb w11, [sp, #14]
-; CHECK-NEXT:    mov z2.b, z10.b[9]
-; CHECK-NEXT:    fmov w11, s4
-; CHECK-NEXT:    mov z4.b, z10.b[7]
-; CHECK-NEXT:    strb w10, [sp, #24]
-; CHECK-NEXT:    fmov w10, s27
-; CHECK-NEXT:    strb w8, [sp, #23]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    strb w8, [sp, #25]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.b, z0.b[9]
+; CHECK-NEXT:    strb w8, [sp, #24]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    strb w9, [sp, #22]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    mov z5.b, z10.b[5]
-; CHECK-NEXT:    mov z6.b, z10.b[3]
-; CHECK-NEXT:    mov z7.b, z10.b[1]
-; CHECK-NEXT:    fmov w15, s10
-; CHECK-NEXT:    strb w10, [sp, #21]
-; CHECK-NEXT:    fmov w10, s5
+; CHECK-NEXT:    mov z2.b, z0.b[7]
+; CHECK-NEXT:    strb w8, [sp, #23]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.b, z0.b[5]
+; CHECK-NEXT:    strb w8, [sp, #22]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.b, z0.b[3]
+; CHECK-NEXT:    mov z0.b, z0.b[1]
+; CHECK-NEXT:    strb w8, [sp, #21]
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    strb w8, [sp, #20]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    strb w8, [sp, #19]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strb w8, [sp, #18]
 ; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    strb w9, [sp, #19]
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    strb w15, [sp]
-; CHECK-NEXT:    strb w12, [sp, #13]
-; CHECK-NEXT:    ldr q17, [sp, #32]
-; CHECK-NEXT:    strb w13, [sp, #12]
-; CHECK-NEXT:    ldr q0, [sp, #48]
-; CHECK-NEXT:    strb w14, [sp, #11]
-; CHECK-NEXT:    strb w11, [sp, #6]
-; CHECK-NEXT:    strb w10, [sp, #18]
-; CHECK-NEXT:    ldr q18, [sp]
 ; CHECK-NEXT:    strb w8, [sp, #17]
-; CHECK-NEXT:    add z0.b, z17.b, z0.b
-; CHECK-NEXT:    strb w9, [sp, #16]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    ldr q0, [sp, #48]
+; CHECK-NEXT:    add z0.b, z4.b, z0.b
+; CHECK-NEXT:    strb w8, [sp, #16]
 ; CHECK-NEXT:    ldr q1, [sp, #16]
-; CHECK-NEXT:    ldp d9, d8, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT:    add z1.b, z18.b, z1.b
-; CHECK-NEXT:    ldp d13, d12, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    add z1.b, z5.b, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
-; CHECK-NEXT:    ldp d15, d14, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
   %tmp1 = load <32 x i8>, ptr %a
   %tmp2 = load <32 x i8>, ptr %b
@@ -936,21 +921,21 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{
 ; CHECK-LABEL: uzp_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z2.h, z0.h[2]
-; CHECK-NEXT:    mov z0.h, z0.h[3]
+; CHECK-NEXT:    mov z3.h, z0.h[3]
 ; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    fmov w11, s0
 ; CHECK-NEXT:    strh w8, [sp, #-16]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w11, s3
 ; CHECK-NEXT:    strh w9, [sp, #6]
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    strh w9, [sp, #8]
 ; CHECK-NEXT:    strh w10, [sp, #4]
 ; CHECK-NEXT:    strh w11, [sp, #2]
-; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    strh w10, [sp, #12]
-; CHECK-NEXT:    strh w9, [sp, #8]
 ; CHECK-NEXT:    ldp d0, d1, [sp]
 ; CHECK-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEXT:    str d0, [x0]
@@ -970,106 +955,106 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #64
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    mov z17.h, z0.h[4]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov z18.h, z0.h[2]
-; CHECK-NEXT:    mov z19.h, z0.h[7]
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    mov z3.h, z1.h[4]
-; CHECK-NEXT:    ldp q21, q22, [x1]
-; CHECK-NEXT:    mov z2.h, z1.h[6]
-; CHECK-NEXT:    mov z4.h, z1.h[2]
-; CHECK-NEXT:    strh w8, [sp, #40]
+; CHECK-NEXT:    ldp q1, q3, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x1]
 ; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    mov z5.h, z1.h[7]
-; CHECK-NEXT:    mov z6.h, z1.h[5]
-; CHECK-NEXT:    mov z7.h, z1.h[3]
+; CHECK-NEXT:    mov z4.h, z3.h[6]
+; CHECK-NEXT:    mov z5.h, z3.h[4]
+; CHECK-NEXT:    mov z6.h, z3.h[2]
+; CHECK-NEXT:    mov z7.h, z1.h[6]
+; CHECK-NEXT:    strh w8, [sp, #40]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #32]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z1.h[4]
+; CHECK-NEXT:    strh w8, [sp, #46]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.h, z1.h[2]
 ; CHECK-NEXT:    strh w8, [sp, #44]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    mov z16.h, z1.h[1]
-; CHECK-NEXT:    mov z1.h, z0.h[6]
-; CHECK-NEXT:    strh w9, [sp, #32]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w10, [sp, #46]
-; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.h, z2.h[2]
+; CHECK-NEXT:    strh w8, [sp, #42]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.h, z0.h[6]
+; CHECK-NEXT:    strh w8, [sp, #38]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z2.h[6]
 ; CHECK-NEXT:    strh w8, [sp, #36]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z2.h, z22.h[6]
-; CHECK-NEXT:    strh w9, [sp, #42]
-; CHECK-NEXT:    strh w10, [sp, #38]
-; CHECK-NEXT:    fmov w9, s22
-; CHECK-NEXT:    fmov w10, s21
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.h, z2.h[4]
 ; CHECK-NEXT:    strh w8, [sp, #34]
 ; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    mov z3.h, z22.h[4]
-; CHECK-NEXT:    mov z4.h, z22.h[2]
-; CHECK-NEXT:    mov z17.h, z22.h[7]
-; CHECK-NEXT:    mov z18.h, z22.h[5]
-; CHECK-NEXT:    mov z23.h, z22.h[3]
-; CHECK-NEXT:    mov z24.h, z22.h[1]
-; CHECK-NEXT:    mov z22.h, z21.h[6]
-; CHECK-NEXT:    strh w9, [sp, #8]
-; CHECK-NEXT:    fmov w9, s3
-; CHECK-NEXT:    strh w10, [sp]
-; CHECK-NEXT:    fmov w10, s4
+; CHECK-NEXT:    ldr q16, [sp, #32]
+; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z0.h[4]
 ; CHECK-NEXT:    strh w8, [sp, #14]
-; CHECK-NEXT:    fmov w8, s22
-; CHECK-NEXT:    mov z25.h, z21.h[4]
-; CHECK-NEXT:    mov z26.h, z21.h[2]
-; CHECK-NEXT:    strh w9, [sp, #12]
-; CHECK-NEXT:    fmov w9, s25
-; CHECK-NEXT:    strh w10, [sp, #10]
-; CHECK-NEXT:    fmov w10, s26
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.h, z0.h[2]
+; CHECK-NEXT:    strh w8, [sp, #12]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    mov z6.h, z3.h[7]
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    mov z7.h, z1.h[7]
 ; CHECK-NEXT:    strh w8, [sp, #6]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z3.h[5]
+; CHECK-NEXT:    strh w8, [sp, #4]
 ; CHECK-NEXT:    fmov w8, s5
-; CHECK-NEXT:    strh w9, [sp, #4]
-; CHECK-NEXT:    fmov w9, s6
-; CHECK-NEXT:    strh w10, [sp, #2]
-; CHECK-NEXT:    fmov w10, s7
+; CHECK-NEXT:    mov z5.h, z3.h[3]
+; CHECK-NEXT:    mov z3.h, z3.h[1]
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s6
+; CHECK-NEXT:    ldr q6, [sp]
 ; CHECK-NEXT:    strh w8, [sp, #62]
-; CHECK-NEXT:    fmov w8, s16
-; CHECK-NEXT:    mov z20.h, z0.h[5]
-; CHECK-NEXT:    mov z1.h, z0.h[3]
-; CHECK-NEXT:    strh w9, [sp, #60]
-; CHECK-NEXT:    fmov w9, s19
-; CHECK-NEXT:    strh w10, [sp, #58]
-; CHECK-NEXT:    fmov w10, s20
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z1.h[5]
+; CHECK-NEXT:    strh w8, [sp, #60]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.h, z1.h[3]
+; CHECK-NEXT:    mov z1.h, z1.h[1]
+; CHECK-NEXT:    strh w8, [sp, #58]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.h, z2.h[7]
 ; CHECK-NEXT:    strh w8, [sp, #56]
+; CHECK-NEXT:    fmov w8, s7
+; CHECK-NEXT:    strh w8, [sp, #54]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z2.h[5]
+; CHECK-NEXT:    strh w8, [sp, #52]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.h, z2.h[3]
+; CHECK-NEXT:    strh w8, [sp, #50]
 ; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    mov z1.h, z2.h[1]
+; CHECK-NEXT:    mov z2.h, z0.h[7]
+; CHECK-NEXT:    strh w8, [sp, #48]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    mov z3.h, z0.h[5]
+; CHECK-NEXT:    strh w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z0.h[3]
 ; CHECK-NEXT:    mov z0.h, z0.h[1]
-; CHECK-NEXT:    strh w9, [sp, #54]
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    strh w10, [sp, #52]
-; CHECK-NEXT:    fmov w10, s17
-; CHECK-NEXT:    strh w8, [sp, #50]
-; CHECK-NEXT:    fmov w8, s18
-; CHECK-NEXT:    mov z27.h, z21.h[7]
-; CHECK-NEXT:    strh w9, [sp, #48]
-; CHECK-NEXT:    fmov w9, s23
-; CHECK-NEXT:    strh w10, [sp, #30]
-; CHECK-NEXT:    fmov w10, s24
 ; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s27
-; CHECK-NEXT:    mov z28.h, z21.h[5]
-; CHECK-NEXT:    mov z2.h, z21.h[3]
-; CHECK-NEXT:    mov z3.h, z21.h[1]
-; CHECK-NEXT:    strh w9, [sp, #26]
-; CHECK-NEXT:    fmov w9, s28
-; CHECK-NEXT:    strh w10, [sp, #24]
-; CHECK-NEXT:    fmov w10, s2
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    strh w8, [sp, #26]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp, #24]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strh w8, [sp, #22]
 ; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    ldr q4, [sp, #32]
-; CHECK-NEXT:    strh w9, [sp, #20]
-; CHECK-NEXT:    ldr q5, [sp]
-; CHECK-NEXT:    strh w10, [sp, #18]
+; CHECK-NEXT:    strh w8, [sp, #20]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strh w8, [sp, #18]
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    ldr q0, [sp, #48]
+; CHECK-NEXT:    add z0.h, z16.h, z0.h
 ; CHECK-NEXT:    strh w8, [sp, #16]
 ; CHECK-NEXT:    ldr q1, [sp, #16]
-; CHECK-NEXT:    add z0.h, z4.h, z0.h
-; CHECK-NEXT:    add z1.h, z5.h, z1.h
+; CHECK-NEXT:    add z1.h, z6.h, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
@@ -1087,27 +1072,27 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #64
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ldp q2, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ldp q3, q2, [x1]
-; CHECK-NEXT:    mov z4.s, z0.s[2]
-; CHECK-NEXT:    stp s0, s4, [sp, #24]
-; CHECK-NEXT:    mov z4.s, z3.s[2]
-; CHECK-NEXT:    mov z5.s, z2.s[2]
-; CHECK-NEXT:    stp s4, s2, [sp, #4]
-; CHECK-NEXT:    stp s5, s1, [sp, #12]
+; CHECK-NEXT:    ldp q4, q1, [x1]
+; CHECK-NEXT:    mov z3.s, z0.s[2]
+; CHECK-NEXT:    mov z5.s, z1.s[2]
+; CHECK-NEXT:    stp s0, s3, [sp, #24]
+; CHECK-NEXT:    mov z3.s, z4.s[2]
+; CHECK-NEXT:    stp s5, s2, [sp, #12]
 ; CHECK-NEXT:    mov z5.s, z0.s[3]
 ; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    stp s3, s1, [sp, #4]
+; CHECK-NEXT:    mov z1.s, z2.s[1]
 ; CHECK-NEXT:    stp s0, s5, [sp, #40]
-; CHECK-NEXT:    mov z0.s, z3.s[3]
+; CHECK-NEXT:    mov z5.s, z4.s[3]
+; CHECK-NEXT:    mov z4.s, z4.s[1]
+; CHECK-NEXT:    ldp q3, q2, [sp]
 ; CHECK-NEXT:    str s1, [sp, #32]
-; CHECK-NEXT:    mov z1.s, z3.s[1]
-; CHECK-NEXT:    stp s1, s0, [sp, #48]
-; CHECK-NEXT:    ldp q4, q2, [sp]
+; CHECK-NEXT:    stp s4, s5, [sp, #48]
 ; CHECK-NEXT:    ldp q0, q1, [sp, #32]
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
@@ -1127,10 +1112,10 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    ldp q3, q2, [x1]
 ; CHECK-NEXT:    zip1 z4.d, z1.d, z0.d
 ; CHECK-NEXT:    trn2 z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip1 z1.d, z3.d, z2.d
+; CHECK-NEXT:    trn2 z2.d, z3.d, z2.d
 ; CHECK-NEXT:    add z0.d, z4.d, z0.d
-; CHECK-NEXT:    zip1 z5.d, z3.d, z2.d
-; CHECK-NEXT:    trn2 z1.d, z3.d, z2.d
-; CHECK-NEXT:    add z1.d, z5.d, z1.d
+; CHECK-NEXT:    add z1.d, z1.d, z2.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x i64>, ptr %a
@@ -1152,51 +1137,51 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z2.h, z0.h[6]
 ; CHECK-NEXT:    mov z3.h, z0.h[4]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    fmov w10, s2
 ; CHECK-NEXT:    mov z4.h, z0.h[2]
+; CHECK-NEXT:    mov z5.h, z1.h[6]
+; CHECK-NEXT:    mov z6.h, z1.h[4]
 ; CHECK-NEXT:    strh w8, [sp, #8]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    strh w8, [sp]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.h, z1.h[2]
+; CHECK-NEXT:    strh w8, [sp, #14]
 ; CHECK-NEXT:    fmov w8, s3
-; CHECK-NEXT:    mov z5.h, z0.h[7]
-; CHECK-NEXT:    mov z6.h, z0.h[5]
-; CHECK-NEXT:    mov z7.h, z0.h[3]
-; CHECK-NEXT:    mov z16.h, z0.h[1]
-; CHECK-NEXT:    mov z0.h, z1.h[6]
-; CHECK-NEXT:    mov z17.h, z1.h[4]
-; CHECK-NEXT:    strh w9, [sp]
-; CHECK-NEXT:    fmov w9, s4
-; CHECK-NEXT:    strh w10, [sp, #14]
-; CHECK-NEXT:    fmov w10, s0
+; CHECK-NEXT:    mov z3.h, z0.h[7]
 ; CHECK-NEXT:    strh w8, [sp, #12]
-; CHECK-NEXT:    fmov w8, s17
-; CHECK-NEXT:    mov z18.h, z1.h[2]
-; CHECK-NEXT:    strh w9, [sp, #10]
-; CHECK-NEXT:    fmov w9, s18
-; CHECK-NEXT:    strh w10, [sp, #6]
-; CHECK-NEXT:    fmov w10, s5
-; CHECK-NEXT:    strh w8, [sp, #4]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z0.h[5]
+; CHECK-NEXT:    strh w8, [sp, #10]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.h, z0.h[3]
+; CHECK-NEXT:    mov z0.h, z0.h[1]
+; CHECK-NEXT:    strh w8, [sp, #6]
 ; CHECK-NEXT:    fmov w8, s6
-; CHECK-NEXT:    mov z19.h, z1.h[7]
-; CHECK-NEXT:    strh w9, [sp, #2]
-; CHECK-NEXT:    fmov w9, s7
-; CHECK-NEXT:    strh w10, [sp, #30]
-; CHECK-NEXT:    fmov w10, s16
+; CHECK-NEXT:    strh w8, [sp, #4]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    mov z2.h, z1.h[7]
+; CHECK-NEXT:    strh w8, [sp, #2]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w8, [sp, #30]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    mov z4.h, z1.h[5]
 ; CHECK-NEXT:    strh w8, [sp, #28]
-; CHECK-NEXT:    fmov w8, s19
-; CHECK-NEXT:    mov z20.h, z1.h[5]
-; CHECK-NEXT:    mov z21.h, z1.h[3]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    mov z5.h, z1.h[3]
+; CHECK-NEXT:    strh w8, [sp, #26]
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z0.h, z1.h[1]
-; CHECK-NEXT:    strh w9, [sp, #26]
-; CHECK-NEXT:    fmov w9, s20
-; CHECK-NEXT:    strh w10, [sp, #24]
-; CHECK-NEXT:    fmov w10, s21
+; CHECK-NEXT:    strh w8, [sp, #24]
+; CHECK-NEXT:    fmov w8, s2
 ; CHECK-NEXT:    strh w8, [sp, #22]
+; CHECK-NEXT:    fmov w8, s4
+; CHECK-NEXT:    strh w8, [sp, #20]
+; CHECK-NEXT:    fmov w8, s5
+; CHECK-NEXT:    strh w8, [sp, #18]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    strh w9, [sp, #20]
-; CHECK-NEXT:    strh w10, [sp, #18]
 ; CHECK-NEXT:    strh w8, [sp, #16]
-; CHECK-NEXT:    ldp q1, q0, [sp]
-; CHECK-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-NEXT:    ldp q3, q0, [sp]
+; CHECK-NEXT:    add z0.h, z3.h, z0.h
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
@@ -1215,24 +1200,24 @@ define void @uzp_v8i32_undef(ptr %a) #0{
 ; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    ldp q1, q0, [x0]
-; CHECK-NEXT:    fmov w10, s1
-; CHECK-NEXT:    mov z5.s, z1.s[3]
 ; CHECK-NEXT:    mov z2.s, z0.s[2]
-; CHECK-NEXT:    mov z3.s, z0.s[3]
-; CHECK-NEXT:    mov z4.s, z0.s[1]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    mov z0.s, z1.s[2]
+; CHECK-NEXT:    mov z3.s, z1.s[2]
+; CHECK-NEXT:    mov z4.s, z0.s[3]
+; CHECK-NEXT:    mov z0.s, z0.s[1]
 ; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    mov z2.s, z1.s[1]
-; CHECK-NEXT:    fmov w12, s3
+; CHECK-NEXT:    mov z2.s, z1.s[3]
 ; CHECK-NEXT:    stp w8, w9, [sp, #8]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s3
+; CHECK-NEXT:    mov z1.s, z1.s[1]
+; CHECK-NEXT:    stp w8, w9, [sp]
 ; CHECK-NEXT:    fmov w8, s4
-; CHECK-NEXT:    fmov w9, s5
-; CHECK-NEXT:    stp w10, w11, [sp]
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    stp w8, w12, [sp, #24]
-; CHECK-NEXT:    stp w10, w9, [sp, #16]
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    stp w9, w8, [sp, #24]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    fmov w9, s1
+; CHECK-NEXT:    stp w9, w8, [sp, #16]
 ; CHECK-NEXT:    ldp q0, q1, [sp]
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    stp q0, q0, [x0]
@@ -1251,12 +1236,13 @@ define void @zip_vscale2_4(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
-; CHECK-NEXT:    ldp q2, q3, [x1]
-; CHECK-NEXT:    zip1 z4.d, z1.d, z2.d
-; CHECK-NEXT:    trn2 z1.d, z1.d, z2.d
-; CHECK-NEXT:    zip1 z2.d, z0.d, z3.d
-; CHECK-NEXT:    trn2 z0.d, z0.d, z3.d
-; CHECK-NEXT:    fadd z2.d, p0/m, z2.d, z4.d
+; CHECK-NEXT:    ldp q3, q2, [x1]
+; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
+; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
+; CHECK-NEXT:    trn2 z1.d, z1.d, z3.d
+; CHECK-NEXT:    trn2 z0.d, z0.d, z2.d
+; CHECK-NEXT:    movprfx z2, z4
+; CHECK-NEXT:    fadd z2.d, p0/m, z2.d, z5.d
 ; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    stp q2, q0, [x0]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index 93146398a653c5..cb3b41bd406bc1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -7,30 +7,30 @@ target triple = "aarch64-unknown-linux-gnu"
 define i1 @ptest_v16i1(ptr %a, ptr %b) {
 ; CHECK-LABEL: ptest_v16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.s, vl4
-; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    fcmne p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    fcmne p2.s, p0/z, z1.s, #0.0
-; CHECK-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    fcmne p2.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    splice z0.h, p1, z0.h, z1.h
-; CHECK-NEXT:    fcmne p2.s, p0/z, z3.s, #0.0
+; CHECK-NEXT:    fcmne p3.s, p0/z, z3.s, #0.0
 ; CHECK-NEXT:    fcmne p0.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    mov z2.s, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z2.s, p3/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    mov z3.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ptrue p1.b, vl16
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
 ; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    umaxv b0, p0, z1.b
+; CHECK-NEXT:    umaxv b0, p1, z1.b
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
@@ -44,49 +44,49 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) {
 define i1 @ptest_or_v16i1(ptr %a, ptr %b) {
 ; CHECK-LABEL: ptest_or_v16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-NEXT:    ldp q4, q5, [x1]
+; CHECK-NEXT:    ldp q6, q7, [x1, #32]
+; CHECK-NEXT:    fcmne p1.s, p0/z, z3.s, #0.0
+; CHECK-NEXT:    fcmne p2.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    fcmne p3.s, p0/z, z1.s, #0.0
+; CHECK-NEXT:    fcmne p4.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    fcmne p5.s, p0/z, z7.s, #0.0
+; CHECK-NEXT:    fcmne p6.s, p0/z, z6.s, #0.0
+; CHECK-NEXT:    fcmne p7.s, p0/z, z5.s, #0.0
+; CHECK-NEXT:    fcmne p0.s, p0/z, z4.s, #0.0
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z2.s, p3/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z3.s, p4/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z4.s, p5/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z5.s, p6/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z6.s, p7/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z7.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    fcmne p3.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    ldp q2, q3, [x0]
-; CHECK-NEXT:    fcmne p2.s, p0/z, z1.s, #0.0
-; CHECK-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
-; CHECK-NEXT:    fcmne p3.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    mov z4.s, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    fcmne p2.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    ldp q3, q0, [x1, #32]
-; CHECK-NEXT:    mov z2.s, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
-; CHECK-NEXT:    fcmne p3.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    mov z3.s, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ldp q5, q6, [x1]
-; CHECK-NEXT:    fcmne p2.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
-; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
-; CHECK-NEXT:    uzp1 z1.b, z4.b, z4.b
-; CHECK-NEXT:    fcmne p2.s, p0/z, z6.s, #0.0
-; CHECK-NEXT:    fcmne p0.s, p0/z, z5.s, #0.0
-; CHECK-NEXT:    mov z2.s, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z4.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
-; CHECK-NEXT:    uzp1 z3.b, z4.b, z4.b
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z4.h
+; CHECK-NEXT:    splice z7.h, p1, z7.h, z6.h
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
+; CHECK-NEXT:    uzp1 z2.b, z5.b, z5.b
+; CHECK-NEXT:    uzp1 z3.b, z7.b, z7.b
 ; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
 ; CHECK-NEXT:    splice z3.b, p0, z3.b, z2.b
-; CHECK-NEXT:    orr z0.d, z1.d, z3.d
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    orr z0.d, z1.d, z3.d
 ; CHECK-NEXT:    umaxv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
@@ -111,49 +111,49 @@ declare i1 @llvm.vector.reduce.or.i1.v16i1(<16 x i1>)
 define i1 @ptest_and_v16i1(ptr %a, ptr %b) {
 ; CHECK-LABEL: ptest_and_v16i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-NEXT:    ldp q4, q5, [x1]
+; CHECK-NEXT:    ldp q6, q7, [x1, #32]
+; CHECK-NEXT:    fcmne p1.s, p0/z, z3.s, #0.0
+; CHECK-NEXT:    fcmne p2.s, p0/z, z2.s, #0.0
+; CHECK-NEXT:    fcmne p3.s, p0/z, z1.s, #0.0
+; CHECK-NEXT:    fcmne p4.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    fcmne p5.s, p0/z, z7.s, #0.0
+; CHECK-NEXT:    fcmne p6.s, p0/z, z6.s, #0.0
+; CHECK-NEXT:    fcmne p7.s, p0/z, z5.s, #0.0
+; CHECK-NEXT:    fcmne p0.s, p0/z, z4.s, #0.0
+; CHECK-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z2.s, p3/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z3.s, p4/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z4.s, p5/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z5.s, p6/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z6.s, p7/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    mov z7.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    fcmne p3.s, p0/z, z0.s, #0.0
-; CHECK-NEXT:    ldp q2, q3, [x0]
-; CHECK-NEXT:    fcmne p2.s, p0/z, z1.s, #0.0
-; CHECK-NEXT:    mov z1.s, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
-; CHECK-NEXT:    fcmne p3.s, p0/z, z2.s, #0.0
-; CHECK-NEXT:    mov z4.s, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    fcmne p2.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    ldp q3, q0, [x1, #32]
-; CHECK-NEXT:    mov z2.s, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
-; CHECK-NEXT:    fcmne p3.s, p0/z, z3.s, #0.0
-; CHECK-NEXT:    mov z3.s, p3/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    ldp q5, q6, [x1]
-; CHECK-NEXT:    fcmne p2.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    mov z0.s, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z0.h
-; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
-; CHECK-NEXT:    uzp1 z1.b, z4.b, z4.b
-; CHECK-NEXT:    fcmne p2.s, p0/z, z6.s, #0.0
-; CHECK-NEXT:    fcmne p0.s, p0/z, z5.s, #0.0
-; CHECK-NEXT:    mov z2.s, p2/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    mov z4.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    splice z4.h, p1, z4.h, z2.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
-; CHECK-NEXT:    uzp1 z3.b, z4.b, z4.b
+; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
+; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z4.h
+; CHECK-NEXT:    splice z7.h, p1, z7.h, z6.h
+; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
+; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
+; CHECK-NEXT:    uzp1 z2.b, z5.b, z5.b
+; CHECK-NEXT:    uzp1 z3.b, z7.b, z7.b
 ; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
 ; CHECK-NEXT:    splice z3.b, p0, z3.b, z2.b
-; CHECK-NEXT:    and z0.d, z1.d, z3.d
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    and z0.d, z1.d, z3.d
 ; CHECK-NEXT:    uminv b0, p0, z0.b
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
index 3dd08f04f26193..eeed8e422652fa 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
@@ -15,14 +15,14 @@ define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) {
 ; CHECK-NEXT:    mov z1.s, z0.s[3]
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z2.s, z0.s[2]
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    mov z3.s, z0.s[1]
 ; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    strh w10, [sp, #12]
-; CHECK-NEXT:    strh w11, [sp, #10]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index c4cc4d9d408de9..2ace8b0debc37f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) {
 ; CHECK-LABEL: bitreverse_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    lsr z0.h, z0.h, #8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -24,8 +24,8 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) {
 define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) {
 ; CHECK-LABEL: bitreverse_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -36,8 +36,8 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) {
 define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) {
 ; CHECK-LABEL: bitreverse_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -48,8 +48,8 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) {
 define void @bitreverse_v32i8(ptr %a) {
 ; CHECK-LABEL: bitreverse_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT:    rbit z1.b, p0/m, z1.b
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -63,8 +63,8 @@ define void @bitreverse_v32i8(ptr %a) {
 define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) {
 ; CHECK-LABEL: bitreverse_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    lsr z0.s, z0.s, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -76,8 +76,8 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) {
 define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) {
 ; CHECK-LABEL: bitreverse_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -88,8 +88,8 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) {
 define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) {
 ; CHECK-LABEL: bitreverse_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -100,8 +100,8 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) {
 define void @bitreverse_v16i16(ptr %a) {
 ; CHECK-LABEL: bitreverse_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT:    rbit z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -115,8 +115,8 @@ define void @bitreverse_v16i16(ptr %a) {
 define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) {
 ; CHECK-LABEL: bitreverse_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -127,8 +127,8 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) {
 define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) {
 ; CHECK-LABEL: bitreverse_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -139,8 +139,8 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) {
 define void @bitreverse_v8i32(ptr %a) {
 ; CHECK-LABEL: bitreverse_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT:    rbit z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -154,8 +154,8 @@ define void @bitreverse_v8i32(ptr %a) {
 define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) {
 ; CHECK-LABEL: bitreverse_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -166,8 +166,8 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) {
 define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) {
 ; CHECK-LABEL: bitreverse_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -178,8 +178,8 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) {
 define void @bitreverse_v4i64(ptr %a) {
 ; CHECK-LABEL: bitreverse_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT:    rbit z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -197,8 +197,8 @@ define void @bitreverse_v4i64(ptr %a) {
 define <2 x i16> @bswap_v2i16(<2 x i16> %op) {
 ; CHECK-LABEL: bswap_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    lsr z0.s, z0.s, #16
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -210,8 +210,8 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) {
 define <4 x i16> @bswap_v4i16(<4 x i16> %op) {
 ; CHECK-LABEL: bswap_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -222,8 +222,8 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) {
 define <8 x i16> @bswap_v8i16(<8 x i16> %op) {
 ; CHECK-LABEL: bswap_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -234,8 +234,8 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) {
 define void @bswap_v16i16(ptr %a) {
 ; CHECK-LABEL: bswap_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    revb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -249,8 +249,8 @@ define void @bswap_v16i16(ptr %a) {
 define <2 x i32> @bswap_v2i32(<2 x i32> %op) {
 ; CHECK-LABEL: bswap_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -261,8 +261,8 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) {
 define <4 x i32> @bswap_v4i32(<4 x i32> %op) {
 ; CHECK-LABEL: bswap_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -273,8 +273,8 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) {
 define void @bswap_v8i32(ptr %a) {
 ; CHECK-LABEL: bswap_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revb z0.s, p0/m, z0.s
 ; CHECK-NEXT:    revb z1.s, p0/m, z1.s
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -288,8 +288,8 @@ define void @bswap_v8i32(ptr %a) {
 define <1 x i64> @bswap_v1i64(<1 x i64> %op) {
 ; CHECK-LABEL: bswap_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -300,8 +300,8 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) {
 define <2 x i64> @bswap_v2i64(<2 x i64> %op) {
 ; CHECK-LABEL: bswap_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -312,8 +312,8 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) {
 define void @bswap_v4i64(ptr %a) {
 ; CHECK-LABEL: bswap_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    revb z0.d, p0/m, z0.d
 ; CHECK-NEXT:    revb z1.d, p0/m, z1.d
 ; CHECK-NEXT:    stp q0, q1, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
index 355dcc6b3faeea..b45a3c0904a05e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
@@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) {
 ; CHECK-LABEL: sdiv_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sxtb z0.h, p0/m, z0.h
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -20,8 +20,8 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) {
 define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) {
 ; CHECK-LABEL: sdiv_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -32,8 +32,8 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) {
 define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) {
 ; CHECK-LABEL: sdiv_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -44,8 +44,8 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) {
 define void @sdiv_v32i8(ptr %a) {
 ; CHECK-LABEL: sdiv_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl16
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT:    asrd z1.b, p0/m, z1.b, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -59,8 +59,8 @@ define void @sdiv_v32i8(ptr %a) {
 define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) {
 ; CHECK-LABEL: sdiv_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -72,8 +72,8 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) {
 define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) {
 ; CHECK-LABEL: sdiv_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -84,8 +84,8 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) {
 define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) {
 ; CHECK-LABEL: sdiv_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -96,8 +96,8 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) {
 define void @sdiv_v16i16(ptr %a) {
 ; CHECK-LABEL: sdiv_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT:    asrd z1.h, p0/m, z1.h, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -111,8 +111,8 @@ define void @sdiv_v16i16(ptr %a) {
 define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) {
 ; CHECK-LABEL: sdiv_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -123,8 +123,8 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) {
 define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) {
 ; CHECK-LABEL: sdiv_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -135,8 +135,8 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) {
 define void @sdiv_v8i32(ptr %a) {
 ; CHECK-LABEL: sdiv_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT:    asrd z1.s, p0/m, z1.s, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]
@@ -150,8 +150,8 @@ define void @sdiv_v8i32(ptr %a) {
 define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) {
 ; CHECK-LABEL: sdiv_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -163,8 +163,8 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) {
 define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) {
 ; CHECK-LABEL: sdiv_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -175,8 +175,8 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) {
 define void @sdiv_v4i64(ptr %a) {
 ; CHECK-LABEL: sdiv_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT:    asrd z1.d, p0/m, z1.d, #5
 ; CHECK-NEXT:    stp q0, q1, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index 62602950008040..05efb6d86bad5a 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -22,8 +22,8 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) {
 define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2) {
 ; CHECK-LABEL: interleave_store_without_splat:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q1 killed $q1 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT:    st2w { z0.s, z1.s }, p0, [x0]
 ; CHECK-NEXT:    ret
@@ -36,12 +36,12 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2
 define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2) {
 ; CHECK-LABEL: interleave_store_legalization:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #8 // =0x8
-; CHECK-NEXT:    // kill: def $q3 killed $q3 killed $z2_z3 def $z2_z3
+; CHECK-NEXT:    ptrue p0.s, vl4
 ; CHECK-NEXT:    mov z5.d, z2.d
-; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    // kill: def $q3 killed $q3 def $z2_z3
+; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    mov z4.d, z0.d
-; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    st2w { z4.s, z5.s }, p0, [x0]
 ; CHECK-NEXT:    st2w { z2.s, z3.s }, p0, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index 49492b428ddef5..b4a70216dafd2d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) {
 ; CHECK-LABEL: store_trunc_v8i16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl8
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.h }, p0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <8 x i16>, ptr %ap
@@ -20,8 +20,8 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) {
 define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) {
 ; CHECK-LABEL: store_trunc_v4i32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1b { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
@@ -33,8 +33,8 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) {
 define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) {
 ; CHECK-LABEL: store_trunc_v4i32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <4 x i32>, ptr %ap
@@ -46,8 +46,8 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) {
 define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) {
 ; CHECK-LABEL: store_trunc_v2i64i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
@@ -59,8 +59,8 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) {
 define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) {
 ; CHECK-LABEL: store_trunc_v2i256i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    ldr d0, [x0, #32]
 ; CHECK-NEXT:    ldr d1, [x0]
 ; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
 ; CHECK-NEXT:    str q1, [x1]

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index ef992f0736fee8..0c1fb60d7bfa3c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -13,8 +13,8 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -29,15 +29,15 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
-; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
-; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    uzp1 z3.b, z3.b, z3.b
 ; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    splice z3.b, p0, z3.b, z2.b
-; CHECK-NEXT:    add z1.b, z3.b, z3.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z3.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z1.b, z2.b, z2.b
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <32 x i16>, ptr %in
@@ -53,27 +53,27 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0, #64]
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    ldp q2, q3, [x0, #96]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q4, q5, [x0, #96]
+; CHECK-NEXT:    ldp q6, q7, [x0, #32]
 ; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
-; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
-; CHECK-NEXT:    add z0.b, z0.b, z0.b
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    ldp q4, q5, [x0]
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    uzp1 z3.b, z3.b, z3.b
-; CHECK-NEXT:    splice z2.b, p0, z2.b, z3.b
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z5.b, z5.b, z5.b
 ; CHECK-NEXT:    uzp1 z4.b, z4.b, z4.b
-; CHECK-NEXT:    ldp q6, q7, [x0, #32]
-; CHECK-NEXT:    uzp1 z1.b, z5.b, z5.b
-; CHECK-NEXT:    splice z4.b, p0, z4.b, z1.b
-; CHECK-NEXT:    uzp1 z3.b, z6.b, z6.b
-; CHECK-NEXT:    uzp1 z1.b, z7.b, z7.b
-; CHECK-NEXT:    splice z3.b, p0, z3.b, z1.b
-; CHECK-NEXT:    add z1.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z7.b, z7.b, z7.b
+; CHECK-NEXT:    uzp1 z6.b, z6.b, z6.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z3.b
+; CHECK-NEXT:    splice z4.b, p0, z4.b, z5.b
+; CHECK-NEXT:    splice z6.b, p0, z6.b, z7.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    add z2.b, z2.b, z2.b
+; CHECK-NEXT:    add z1.b, z4.b, z4.b
+; CHECK-NEXT:    add z3.b, z6.b, z6.b
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    add z0.b, z4.b, z4.b
-; CHECK-NEXT:    add z1.b, z3.b, z3.b
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
   %a = load <64 x i16>, ptr %in
   %b = trunc <64 x i16> %a to <64 x i8>
@@ -88,49 +88,49 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0, #192]
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    ldp q2, q3, [x0, #224]
+; CHECK-NEXT:    ldp q6, q7, [x0, #224]
+; CHECK-NEXT:    ldp q2, q3, [x0, #32]
 ; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
-; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
-; CHECK-NEXT:    add z0.b, z0.b, z0.b
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    ldp q6, q7, [x0, #128]
-; CHECK-NEXT:    uzp1 z3.b, z3.b, z3.b
-; CHECK-NEXT:    splice z2.b, p0, z2.b, z3.b
-; CHECK-NEXT:    add z2.b, z2.b, z2.b
-; CHECK-NEXT:    uzp1 z6.b, z6.b, z6.b
-; CHECK-NEXT:    ldp q1, q3, [x0, #160]
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    uzp1 z7.b, z7.b, z7.b
-; CHECK-NEXT:    splice z6.b, p0, z6.b, z7.b
-; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
-; CHECK-NEXT:    ldp q16, q17, [x0, #64]
+; CHECK-NEXT:    uzp1 z6.b, z6.b, z6.b
+; CHECK-NEXT:    ldp q4, q5, [x0]
 ; CHECK-NEXT:    uzp1 z3.b, z3.b, z3.b
-; CHECK-NEXT:    splice z1.b, p0, z1.b, z3.b
-; CHECK-NEXT:    add z1.b, z1.b, z1.b
+; CHECK-NEXT:    ldp q16, q17, [x0, #64]
+; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
+; CHECK-NEXT:    ldp q18, q19, [x0, #128]
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT:    ldp q20, q21, [x0, #160]
+; CHECK-NEXT:    splice z6.b, p0, z6.b, z7.b
+; CHECK-NEXT:    ldp q22, q23, [x0, #96]
+; CHECK-NEXT:    uzp1 z1.b, z17.b, z17.b
+; CHECK-NEXT:    uzp1 z19.b, z19.b, z19.b
+; CHECK-NEXT:    uzp1 z18.b, z18.b, z18.b
 ; CHECK-NEXT:    uzp1 z16.b, z16.b, z16.b
-; CHECK-NEXT:    ldp q7, q18, [x0, #96]
-; CHECK-NEXT:    uzp1 z17.b, z17.b, z17.b
-; CHECK-NEXT:    splice z16.b, p0, z16.b, z17.b
-; CHECK-NEXT:    uzp1 z7.b, z7.b, z7.b
-; CHECK-NEXT:    ldp q4, q5, [x0, #32]
-; CHECK-NEXT:    uzp1 z3.b, z18.b, z18.b
-; CHECK-NEXT:    splice z7.b, p0, z7.b, z3.b
+; CHECK-NEXT:    uzp1 z21.b, z21.b, z21.b
+; CHECK-NEXT:    uzp1 z20.b, z20.b, z20.b
+; CHECK-NEXT:    uzp1 z5.b, z5.b, z5.b
+; CHECK-NEXT:    uzp1 z7.b, z23.b, z23.b
+; CHECK-NEXT:    uzp1 z17.b, z22.b, z22.b
 ; CHECK-NEXT:    uzp1 z4.b, z4.b, z4.b
-; CHECK-NEXT:    ldp q19, q20, [x0]
-; CHECK-NEXT:    uzp1 z3.b, z5.b, z5.b
-; CHECK-NEXT:    stp q0, q2, [x1, #96]
-; CHECK-NEXT:    add z0.b, z6.b, z6.b
-; CHECK-NEXT:    splice z4.b, p0, z4.b, z3.b
-; CHECK-NEXT:    stp q0, q1, [x1, #64]
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z3.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    splice z18.b, p0, z18.b, z19.b
+; CHECK-NEXT:    splice z16.b, p0, z16.b, z1.b
+; CHECK-NEXT:    add z1.b, z6.b, z6.b
+; CHECK-NEXT:    splice z20.b, p0, z20.b, z21.b
+; CHECK-NEXT:    splice z17.b, p0, z17.b, z7.b
+; CHECK-NEXT:    splice z4.b, p0, z4.b, z5.b
+; CHECK-NEXT:    stp q0, q1, [x1, #96]
+; CHECK-NEXT:    add z2.b, z2.b, z2.b
+; CHECK-NEXT:    add z5.b, z18.b, z18.b
 ; CHECK-NEXT:    add z0.b, z16.b, z16.b
-; CHECK-NEXT:    uzp1 z18.b, z19.b, z19.b
-; CHECK-NEXT:    add z1.b, z7.b, z7.b
+; CHECK-NEXT:    add z3.b, z20.b, z20.b
+; CHECK-NEXT:    add z1.b, z17.b, z17.b
+; CHECK-NEXT:    add z4.b, z4.b, z4.b
+; CHECK-NEXT:    stp q5, q3, [x1, #64]
+; CHECK-NEXT:    stp q4, q2, [x1]
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    add z1.b, z4.b, z4.b
-; CHECK-NEXT:    uzp1 z17.b, z20.b, z20.b
-; CHECK-NEXT:    splice z18.b, p0, z18.b, z17.b
-; CHECK-NEXT:    add z0.b, z18.b, z18.b
-; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <128 x i16>, ptr %in
   %b = trunc <128 x i16> %a to <128 x i8>
@@ -148,8 +148,8 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -162,18 +162,18 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind {
 define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind {
 ; CHECK-LABEL: trunc_v16i32_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    uzp1 z1.b, z0.b, z0.b
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
+; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.b, z3.b, z3.b
+; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z2.b, z2.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -186,32 +186,32 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind {
 define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v32i32_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #96]
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    ptrue p1.b, vl8
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ldp q2, q3, [x0, #64]
+; CHECK-NEXT:    ldp q2, q3, [x0, #96]
+; CHECK-NEXT:    ldp q4, q5, [x0, #64]
+; CHECK-NEXT:    ldp q6, q7, [x0]
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    ldp q4, q5, [x0]
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
 ; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z5.h
+; CHECK-NEXT:    splice z6.h, p0, z6.h, z7.h
+; CHECK-NEXT:    ptrue p0.b, vl8
 ; CHECK-NEXT:    uzp1 z1.b, z2.b, z2.b
-; CHECK-NEXT:    splice z1.b, p1, z1.b, z0.b
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    ldp q6, q7, [x0, #32]
-; CHECK-NEXT:    uzp1 z3.h, z5.h, z5.h
-; CHECK-NEXT:    splice z4.h, p0, z4.h, z3.h
-; CHECK-NEXT:    uzp1 z2.h, z6.h, z6.h
-; CHECK-NEXT:    uzp1 z0.h, z7.h, z7.h
-; CHECK-NEXT:    splice z2.h, p0, z2.h, z0.h
-; CHECK-NEXT:    uzp1 z0.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT:    uzp1 z2.b, z4.b, z4.b
-; CHECK-NEXT:    splice z2.b, p1, z2.b, z0.b
-; CHECK-NEXT:    add z0.b, z1.b, z1.b
-; CHECK-NEXT:    add z1.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z3.b, z6.b, z6.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z1.b
+; CHECK-NEXT:    splice z3.b, p0, z3.b, z0.b
+; CHECK-NEXT:    add z0.b, z2.b, z2.b
+; CHECK-NEXT:    add z1.b, z3.b, z3.b
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <32 x i32>, ptr %in
@@ -225,58 +225,58 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind {
 define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v64i32_v64i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #128]
+; CHECK-NEXT:    ldp q0, q1, [x0, #64]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    ptrue p1.b, vl8
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    ldp q2, q3, [x0, #160]
+; CHECK-NEXT:    ptrue p1.b, vl8
+; CHECK-NEXT:    ldp q4, q5, [x0, #96]
+; CHECK-NEXT:    ldp q6, q7, [x0]
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    ldp q16, q17, [x0, #128]
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
-; CHECK-NEXT:    uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT:    ldp q1, q17, [x0, #224]
-; CHECK-NEXT:    splice z0.b, p1, z0.b, z2.b
-; CHECK-NEXT:    add z0.b, z0.b, z0.b
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    ldp q18, q2, [x0, #192]
+; CHECK-NEXT:    ldp q18, q19, [x0, #192]
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    ldp q20, q21, [x0, #224]
+; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT:    ldp q22, q23, [x0, #32]
 ; CHECK-NEXT:    uzp1 z17.h, z17.h, z17.h
-; CHECK-NEXT:    splice z1.h, p0, z1.h, z17.h
-; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
+; CHECK-NEXT:    uzp1 z16.h, z16.h, z16.h
+; CHECK-NEXT:    uzp1 z19.h, z19.h, z19.h
 ; CHECK-NEXT:    uzp1 z18.h, z18.h, z18.h
-; CHECK-NEXT:    ldp q4, q5, [x0, #64]
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z18.h, p0, z18.h, z2.h
-; CHECK-NEXT:    uzp1 z2.b, z18.b, z18.b
-; CHECK-NEXT:    splice z2.b, p1, z2.b, z1.b
-; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    ldp q6, q7, [x0, #96]
+; CHECK-NEXT:    uzp1 z21.h, z21.h, z21.h
+; CHECK-NEXT:    uzp1 z20.h, z20.h, z20.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    uzp1 z23.h, z23.h, z23.h
+; CHECK-NEXT:    uzp1 z22.h, z22.h, z22.h
 ; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    splice z16.h, p0, z16.h, z17.h
+; CHECK-NEXT:    splice z18.h, p0, z18.h, z19.h
+; CHECK-NEXT:    splice z20.h, p0, z20.h, z21.h
+; CHECK-NEXT:    splice z6.h, p0, z6.h, z7.h
+; CHECK-NEXT:    splice z22.h, p0, z22.h, z23.h
 ; CHECK-NEXT:    splice z4.h, p0, z4.h, z5.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z1.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z2.b, z16.b, z16.b
+; CHECK-NEXT:    uzp1 z5.b, z18.b, z18.b
+; CHECK-NEXT:    uzp1 z3.b, z20.b, z20.b
+; CHECK-NEXT:    uzp1 z6.b, z6.b, z6.b
+; CHECK-NEXT:    uzp1 z7.b, z22.b, z22.b
 ; CHECK-NEXT:    uzp1 z4.b, z4.b, z4.b
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    ldp q3, q16, [x0]
-; CHECK-NEXT:    uzp1 z1.h, z7.h, z7.h
-; CHECK-NEXT:    splice z6.h, p0, z6.h, z1.h
-; CHECK-NEXT:    uzp1 z1.b, z6.b, z6.b
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z4.b, p1, z4.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    splice z2.b, p1, z2.b, z1.b
+; CHECK-NEXT:    splice z5.b, p1, z5.b, z3.b
+; CHECK-NEXT:    splice z6.b, p1, z6.b, z7.b
+; CHECK-NEXT:    splice z0.b, p1, z0.b, z4.b
 ; CHECK-NEXT:    add z1.b, z2.b, z2.b
-; CHECK-NEXT:    ldp q19, q20, [x0, #32]
-; CHECK-NEXT:    uzp1 z16.h, z16.h, z16.h
-; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    splice z3.h, p0, z3.h, z16.h
-; CHECK-NEXT:    add z1.b, z4.b, z4.b
-; CHECK-NEXT:    uzp1 z3.b, z3.b, z3.b
-; CHECK-NEXT:    uzp1 z18.h, z19.h, z19.h
-; CHECK-NEXT:    uzp1 z17.h, z20.h, z20.h
-; CHECK-NEXT:    splice z18.h, p0, z18.h, z17.h
-; CHECK-NEXT:    uzp1 z16.b, z18.b, z18.b
-; CHECK-NEXT:    splice z3.b, p1, z3.b, z16.b
-; CHECK-NEXT:    add z0.b, z3.b, z3.b
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    add z2.b, z5.b, z5.b
+; CHECK-NEXT:    add z3.b, z6.b, z6.b
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    stp q1, q2, [x1, #32]
+; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i8>
@@ -294,8 +294,8 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -310,15 +310,15 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z3.h, p0, z3.h, z2.h
-; CHECK-NEXT:    add z1.h, z3.h, z3.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z1.h, z2.h, z2.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <16 x i32>, ptr %in
@@ -334,27 +334,27 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0, #64]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ldp q2, q3, [x0, #96]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q4, q5, [x0, #96]
+; CHECK-NEXT:    ldp q6, q7, [x0, #32]
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    add z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    ldp q4, q5, [x0]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    ldp q6, q7, [x0, #32]
-; CHECK-NEXT:    uzp1 z1.h, z5.h, z5.h
-; CHECK-NEXT:    splice z4.h, p0, z4.h, z1.h
-; CHECK-NEXT:    uzp1 z3.h, z6.h, z6.h
-; CHECK-NEXT:    uzp1 z1.h, z7.h, z7.h
-; CHECK-NEXT:    splice z3.h, p0, z3.h, z1.h
-; CHECK-NEXT:    add z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z5.h
+; CHECK-NEXT:    splice z6.h, p0, z6.h, z7.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    add z2.h, z2.h, z2.h
+; CHECK-NEXT:    add z1.h, z4.h, z4.h
+; CHECK-NEXT:    add z3.h, z6.h, z6.h
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    add z0.h, z4.h, z4.h
-; CHECK-NEXT:    add z1.h, z3.h, z3.h
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
   %a = load <32 x i32>, ptr %in
   %b = trunc <32 x i32> %a to <32 x i16>
@@ -369,49 +369,49 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0, #192]
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    ldp q2, q3, [x0, #224]
+; CHECK-NEXT:    ldp q6, q7, [x0, #224]
+; CHECK-NEXT:    ldp q2, q3, [x0, #32]
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
-; CHECK-NEXT:    add z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    ldp q6, q7, [x0, #128]
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
-; CHECK-NEXT:    add z2.h, z2.h, z2.h
-; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    ldp q1, q3, [x0, #160]
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
-; CHECK-NEXT:    splice z6.h, p0, z6.h, z7.h
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT:    ldp q16, q17, [x0, #64]
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    ldp q4, q5, [x0]
 ; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z1.h, p0, z1.h, z3.h
-; CHECK-NEXT:    add z1.h, z1.h, z1.h
+; CHECK-NEXT:    ldp q16, q17, [x0, #64]
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
+; CHECK-NEXT:    ldp q18, q19, [x0, #128]
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT:    ldp q20, q21, [x0, #160]
+; CHECK-NEXT:    splice z6.h, p0, z6.h, z7.h
+; CHECK-NEXT:    ldp q22, q23, [x0, #96]
+; CHECK-NEXT:    uzp1 z1.h, z17.h, z17.h
+; CHECK-NEXT:    uzp1 z19.h, z19.h, z19.h
+; CHECK-NEXT:    uzp1 z18.h, z18.h, z18.h
 ; CHECK-NEXT:    uzp1 z16.h, z16.h, z16.h
-; CHECK-NEXT:    ldp q7, q18, [x0, #96]
-; CHECK-NEXT:    uzp1 z17.h, z17.h, z17.h
-; CHECK-NEXT:    splice z16.h, p0, z16.h, z17.h
-; CHECK-NEXT:    uzp1 z7.h, z7.h, z7.h
-; CHECK-NEXT:    ldp q4, q5, [x0, #32]
-; CHECK-NEXT:    uzp1 z3.h, z18.h, z18.h
-; CHECK-NEXT:    splice z7.h, p0, z7.h, z3.h
+; CHECK-NEXT:    uzp1 z21.h, z21.h, z21.h
+; CHECK-NEXT:    uzp1 z20.h, z20.h, z20.h
+; CHECK-NEXT:    uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT:    uzp1 z7.h, z23.h, z23.h
+; CHECK-NEXT:    uzp1 z17.h, z22.h, z22.h
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    ldp q19, q20, [x0]
-; CHECK-NEXT:    uzp1 z3.h, z5.h, z5.h
-; CHECK-NEXT:    stp q0, q2, [x1, #96]
-; CHECK-NEXT:    add z0.h, z6.h, z6.h
-; CHECK-NEXT:    splice z4.h, p0, z4.h, z3.h
-; CHECK-NEXT:    stp q0, q1, [x1, #64]
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z18.h, p0, z18.h, z19.h
+; CHECK-NEXT:    splice z16.h, p0, z16.h, z1.h
+; CHECK-NEXT:    add z1.h, z6.h, z6.h
+; CHECK-NEXT:    splice z20.h, p0, z20.h, z21.h
+; CHECK-NEXT:    splice z17.h, p0, z17.h, z7.h
+; CHECK-NEXT:    splice z4.h, p0, z4.h, z5.h
+; CHECK-NEXT:    stp q0, q1, [x1, #96]
+; CHECK-NEXT:    add z2.h, z2.h, z2.h
+; CHECK-NEXT:    add z5.h, z18.h, z18.h
 ; CHECK-NEXT:    add z0.h, z16.h, z16.h
-; CHECK-NEXT:    uzp1 z18.h, z19.h, z19.h
-; CHECK-NEXT:    add z1.h, z7.h, z7.h
+; CHECK-NEXT:    add z3.h, z20.h, z20.h
+; CHECK-NEXT:    add z1.h, z17.h, z17.h
+; CHECK-NEXT:    add z4.h, z4.h, z4.h
+; CHECK-NEXT:    stp q5, q3, [x1, #64]
+; CHECK-NEXT:    stp q4, q2, [x1]
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    add z1.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z17.h, z20.h, z20.h
-; CHECK-NEXT:    splice z18.h, p0, z18.h, z17.h
-; CHECK-NEXT:    add z0.h, z18.h, z18.h
-; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <64 x i32>, ptr %in
   %b = trunc <64 x i32> %a to <64 x i16>
@@ -430,8 +430,8 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -444,18 +444,18 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind {
 define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind {
 ; CHECK-LABEL: trunc_v8i64_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    splice z3.s, p0, z3.s, z2.s
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z1.h, z3.h, z3.h
+; CHECK-NEXT:    uzp1 z0.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
 ; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
 ; CHECK-NEXT:    uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -468,33 +468,33 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind {
 define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind {
 ; CHECK-LABEL: trunc_v16i64_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #96]
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q2, q3, [x0, #64]
+; CHECK-NEXT:    ldp q2, q3, [x0, #96]
+; CHECK-NEXT:    ldp q4, q5, [x0, #64]
+; CHECK-NEXT:    ldp q6, q7, [x0]
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    ldp q4, q5, [x0]
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
+; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    splice z4.s, p0, z4.s, z5.s
+; CHECK-NEXT:    splice z6.s, p0, z6.s, z7.s
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
-; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
-; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
-; CHECK-NEXT:    uzp1 z1.b, z1.b, z1.b
-; CHECK-NEXT:    ldp q6, q7, [x0, #32]
-; CHECK-NEXT:    uzp1 z3.s, z5.s, z5.s
-; CHECK-NEXT:    splice z4.s, p0, z4.s, z3.s
-; CHECK-NEXT:    uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT:    uzp1 z0.s, z7.s, z7.s
-; CHECK-NEXT:    splice z2.s, p0, z2.s, z0.s
-; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    uzp1 z0.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z2.h, z4.h, z4.h
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z0.h
-; CHECK-NEXT:    uzp1 z0.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z3.h, z6.h, z6.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z1.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z0.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z1.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z0.b, z3.b, z3.b
 ; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -507,62 +507,62 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind {
 define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v32i64_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #224]
+; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    ptrue p2.b, vl8
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q2, q3, [x0, #192]
+; CHECK-NEXT:    ldp q2, q3, [x0, #224]
+; CHECK-NEXT:    ldp q4, q5, [x0, #32]
+; CHECK-NEXT:    ldp q6, q7, [x0, #64]
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    ldp q1, q16, [x0, #160]
-; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z0.h
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    uzp1 z0.b, z2.b, z2.b
-; CHECK-NEXT:    ldp q3, q17, [x0, #128]
-; CHECK-NEXT:    uzp1 z16.s, z16.s, z16.s
-; CHECK-NEXT:    splice z1.s, p0, z1.s, z16.s
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    ldp q16, q17, [x0, #192]
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
+; CHECK-NEXT:    ldp q18, q19, [x0, #128]
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    ldp q20, q21, [x0, #160]
+; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
+; CHECK-NEXT:    ldp q22, q23, [x0, #96]
 ; CHECK-NEXT:    uzp1 z17.s, z17.s, z17.s
-; CHECK-NEXT:    splice z3.s, p0, z3.s, z17.s
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z1.h
-; CHECK-NEXT:    ldp q4, q5, [x0]
-; CHECK-NEXT:    uzp1 z1.b, z3.b, z3.b
-; CHECK-NEXT:    splice z1.b, p2, z1.b, z0.b
-; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
-; CHECK-NEXT:    ldp q6, q7, [x0, #64]
-; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT:    splice z4.s, p0, z4.s, z5.s
+; CHECK-NEXT:    uzp1 z16.s, z16.s, z16.s
+; CHECK-NEXT:    uzp1 z19.s, z19.s, z19.s
+; CHECK-NEXT:    uzp1 z18.s, z18.s, z18.s
+; CHECK-NEXT:    uzp1 z21.s, z21.s, z21.s
+; CHECK-NEXT:    uzp1 z20.s, z20.s, z20.s
 ; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
-; CHECK-NEXT:    ldp q18, q19, [x0, #96]
-; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
+; CHECK-NEXT:    uzp1 z23.s, z23.s, z23.s
+; CHECK-NEXT:    uzp1 z22.s, z22.s, z22.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
+; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    splice z16.s, p0, z16.s, z17.s
+; CHECK-NEXT:    splice z18.s, p0, z18.s, z19.s
+; CHECK-NEXT:    splice z20.s, p0, z20.s, z21.s
 ; CHECK-NEXT:    splice z6.s, p0, z6.s, z7.s
+; CHECK-NEXT:    splice z22.s, p0, z22.s, z23.s
+; CHECK-NEXT:    splice z4.s, p0, z4.s, z5.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.h, vl4
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z2.h, z16.h, z16.h
+; CHECK-NEXT:    uzp1 z5.h, z18.h, z18.h
+; CHECK-NEXT:    uzp1 z3.h, z20.h, z20.h
 ; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT:    uzp1 z16.s, z18.s, z18.s
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
-; CHECK-NEXT:    uzp1 z0.s, z19.s, z19.s
-; CHECK-NEXT:    splice z16.s, p0, z16.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z16.h, z16.h
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    splice z6.h, p1, z6.h, z0.h
-; CHECK-NEXT:    uzp1 z0.b, z6.b, z6.b
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z3.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z2.h
-; CHECK-NEXT:    uzp1 z2.b, z3.b, z3.b
-; CHECK-NEXT:    splice z2.b, p2, z2.b, z0.b
-; CHECK-NEXT:    add z0.b, z1.b, z1.b
+; CHECK-NEXT:    uzp1 z7.h, z22.h, z22.h
+; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z1.h
+; CHECK-NEXT:    splice z5.h, p0, z5.h, z3.h
+; CHECK-NEXT:    splice z6.h, p0, z6.h, z7.h
+; CHECK-NEXT:    splice z0.h, p0, z0.h, z4.h
+; CHECK-NEXT:    ptrue p0.b, vl8
+; CHECK-NEXT:    uzp1 z1.b, z2.b, z2.b
+; CHECK-NEXT:    uzp1 z2.b, z5.b, z5.b
+; CHECK-NEXT:    uzp1 z3.b, z6.b, z6.b
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT:    splice z2.b, p0, z2.b, z1.b
+; CHECK-NEXT:    splice z0.b, p0, z0.b, z3.b
 ; CHECK-NEXT:    add z1.b, z2.b, z2.b
-; CHECK-NEXT:    stp q1, q0, [x1]
+; CHECK-NEXT:    add z0.b, z0.b, z0.b
+; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i8>
@@ -580,8 +580,8 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
@@ -594,18 +594,18 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind {
 define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind {
 ; CHECK-LABEL: trunc_v8i64_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
+; CHECK-NEXT:    ldp q1, q0, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.s, vl2
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    splice z3.s, p0, z3.s, z2.s
+; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    uzp1 z0.h, z3.h, z3.h
+; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z2.h, z2.h
 ; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -618,32 +618,32 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind {
 define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v16i64_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #96]
+; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q2, q3, [x0, #64]
+; CHECK-NEXT:    ldp q2, q3, [x0, #96]
+; CHECK-NEXT:    ldp q4, q5, [x0, #64]
+; CHECK-NEXT:    ldp q6, q7, [x0]
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    ldp q4, q5, [x0]
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
+; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
 ; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    splice z4.s, p0, z4.s, z5.s
+; CHECK-NEXT:    splice z6.s, p0, z6.s, z7.s
+; CHECK-NEXT:    ptrue p0.h, vl4
 ; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
-; CHECK-NEXT:    splice z1.h, p1, z1.h, z0.h
-; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
-; CHECK-NEXT:    ldp q6, q7, [x0, #32]
-; CHECK-NEXT:    uzp1 z3.s, z5.s, z5.s
-; CHECK-NEXT:    splice z4.s, p0, z4.s, z3.s
-; CHECK-NEXT:    uzp1 z2.s, z6.s, z6.s
-; CHECK-NEXT:    uzp1 z0.s, z7.s, z7.s
-; CHECK-NEXT:    splice z2.s, p0, z2.s, z0.s
-; CHECK-NEXT:    uzp1 z0.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT:    uzp1 z2.h, z4.h, z4.h
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z0.h
-; CHECK-NEXT:    add z0.h, z1.h, z1.h
-; CHECK-NEXT:    add z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z3.h, z6.h, z6.h
+; CHECK-NEXT:    splice z2.h, p0, z2.h, z1.h
+; CHECK-NEXT:    splice z3.h, p0, z3.h, z0.h
+; CHECK-NEXT:    add z0.h, z2.h, z2.h
+; CHECK-NEXT:    add z1.h, z3.h, z3.h
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <16 x i64>, ptr %in
@@ -657,58 +657,58 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind {
 define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v32i64_v32i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #128]
+; CHECK-NEXT:    ldp q0, q1, [x0, #64]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    ptrue p1.h, vl4
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    ldp q2, q3, [x0, #160]
+; CHECK-NEXT:    ptrue p1.h, vl4
+; CHECK-NEXT:    ldp q4, q5, [x0, #96]
+; CHECK-NEXT:    ldp q6, q7, [x0]
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    ldp q16, q17, [x0, #128]
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT:    ldp q1, q17, [x0, #224]
-; CHECK-NEXT:    splice z0.h, p1, z0.h, z2.h
-; CHECK-NEXT:    add z0.h, z0.h, z0.h
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    ldp q18, q2, [x0, #192]
+; CHECK-NEXT:    ldp q18, q19, [x0, #192]
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    ldp q20, q21, [x0, #224]
+; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
+; CHECK-NEXT:    ldp q22, q23, [x0, #32]
 ; CHECK-NEXT:    uzp1 z17.s, z17.s, z17.s
-; CHECK-NEXT:    splice z1.s, p0, z1.s, z17.s
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT:    uzp1 z16.s, z16.s, z16.s
+; CHECK-NEXT:    uzp1 z19.s, z19.s, z19.s
 ; CHECK-NEXT:    uzp1 z18.s, z18.s, z18.s
-; CHECK-NEXT:    ldp q4, q5, [x0, #64]
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    splice z18.s, p0, z18.s, z2.s
-; CHECK-NEXT:    uzp1 z2.h, z18.h, z18.h
-; CHECK-NEXT:    splice z2.h, p1, z2.h, z1.h
-; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
-; CHECK-NEXT:    ldp q6, q7, [x0, #96]
+; CHECK-NEXT:    uzp1 z21.s, z21.s, z21.s
+; CHECK-NEXT:    uzp1 z20.s, z20.s, z20.s
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT:    uzp1 z23.s, z23.s, z23.s
+; CHECK-NEXT:    uzp1 z22.s, z22.s, z22.s
 ; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
+; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    splice z16.s, p0, z16.s, z17.s
+; CHECK-NEXT:    splice z18.s, p0, z18.s, z19.s
+; CHECK-NEXT:    splice z20.s, p0, z20.s, z21.s
+; CHECK-NEXT:    splice z6.s, p0, z6.s, z7.s
+; CHECK-NEXT:    splice z22.s, p0, z22.s, z23.s
 ; CHECK-NEXT:    splice z4.s, p0, z4.s, z5.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z2.h
+; CHECK-NEXT:    uzp1 z2.h, z16.h, z16.h
+; CHECK-NEXT:    uzp1 z5.h, z18.h, z18.h
+; CHECK-NEXT:    uzp1 z3.h, z20.h, z20.h
+; CHECK-NEXT:    uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT:    uzp1 z7.h, z22.h, z22.h
 ; CHECK-NEXT:    uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
-; CHECK-NEXT:    ldp q3, q16, [x0]
-; CHECK-NEXT:    uzp1 z1.s, z7.s, z7.s
-; CHECK-NEXT:    splice z6.s, p0, z6.s, z1.s
-; CHECK-NEXT:    uzp1 z1.h, z6.h, z6.h
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    splice z4.h, p1, z4.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT:    splice z2.h, p1, z2.h, z1.h
+; CHECK-NEXT:    splice z5.h, p1, z5.h, z3.h
+; CHECK-NEXT:    splice z6.h, p1, z6.h, z7.h
+; CHECK-NEXT:    splice z0.h, p1, z0.h, z4.h
 ; CHECK-NEXT:    add z1.h, z2.h, z2.h
-; CHECK-NEXT:    ldp q19, q20, [x0, #32]
-; CHECK-NEXT:    uzp1 z16.s, z16.s, z16.s
-; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    splice z3.s, p0, z3.s, z16.s
-; CHECK-NEXT:    add z1.h, z4.h, z4.h
-; CHECK-NEXT:    uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT:    uzp1 z18.s, z19.s, z19.s
-; CHECK-NEXT:    uzp1 z17.s, z20.s, z20.s
-; CHECK-NEXT:    splice z18.s, p0, z18.s, z17.s
-; CHECK-NEXT:    uzp1 z16.h, z18.h, z18.h
-; CHECK-NEXT:    splice z3.h, p1, z3.h, z16.h
-; CHECK-NEXT:    add z0.h, z3.h, z3.h
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    add z2.h, z5.h, z5.h
+; CHECK-NEXT:    add z3.h, z6.h, z6.h
+; CHECK-NEXT:    add z0.h, z0.h, z0.h
+; CHECK-NEXT:    stp q1, q2, [x1, #32]
+; CHECK-NEXT:    stp q3, q0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i16>
@@ -726,8 +726,8 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
@@ -742,15 +742,15 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0, #32]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldp q2, q3, [x0]
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
 ; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    splice z3.s, p0, z3.s, z2.s
-; CHECK-NEXT:    add z1.s, z3.s, z3.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z1.s, z2.s, z2.s
 ; CHECK-NEXT:    stp q1, q0, [x1]
 ; CHECK-NEXT:    ret
   %a = load <8 x i64>, ptr %in
@@ -766,27 +766,27 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0, #64]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q2, q3, [x0, #96]
+; CHECK-NEXT:    ldp q2, q3, [x0]
+; CHECK-NEXT:    ldp q4, q5, [x0, #96]
+; CHECK-NEXT:    ldp q6, q7, [x0, #32]
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    add z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    ldp q4, q5, [x0]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
 ; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
-; CHECK-NEXT:    ldp q6, q7, [x0, #32]
-; CHECK-NEXT:    uzp1 z1.s, z5.s, z5.s
-; CHECK-NEXT:    splice z4.s, p0, z4.s, z1.s
-; CHECK-NEXT:    uzp1 z3.s, z6.s, z6.s
-; CHECK-NEXT:    uzp1 z1.s, z7.s, z7.s
-; CHECK-NEXT:    splice z3.s, p0, z3.s, z1.s
-; CHECK-NEXT:    add z1.s, z2.s, z2.s
+; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    splice z4.s, p0, z4.s, z5.s
+; CHECK-NEXT:    splice z6.s, p0, z6.s, z7.s
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z2.s, z2.s, z2.s
+; CHECK-NEXT:    add z1.s, z4.s, z4.s
+; CHECK-NEXT:    add z3.s, z6.s, z6.s
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    add z0.s, z4.s, z4.s
-; CHECK-NEXT:    add z1.s, z3.s, z3.s
-; CHECK-NEXT:    stp q0, q1, [x1]
+; CHECK-NEXT:    stp q2, q3, [x1]
 ; CHECK-NEXT:    ret
   %a = load <16 x i64>, ptr %in
   %b = trunc <16 x i64> %a to <16 x i32>
@@ -801,49 +801,49 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp q0, q1, [x0, #192]
 ; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT:    ldp q2, q3, [x0, #224]
+; CHECK-NEXT:    ldp q6, q7, [x0, #224]
+; CHECK-NEXT:    ldp q2, q3, [x0, #32]
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT:    add z0.s, z0.s, z0.s
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT:    ldp q6, q7, [x0, #128]
-; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
-; CHECK-NEXT:    add z2.s, z2.s, z2.s
-; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
-; CHECK-NEXT:    ldp q1, q3, [x0, #160]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
-; CHECK-NEXT:    splice z6.s, p0, z6.s, z7.s
-; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT:    ldp q16, q17, [x0, #64]
+; CHECK-NEXT:    uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT:    ldp q4, q5, [x0]
 ; CHECK-NEXT:    uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT:    splice z1.s, p0, z1.s, z3.s
-; CHECK-NEXT:    add z1.s, z1.s, z1.s
+; CHECK-NEXT:    ldp q16, q17, [x0, #64]
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z2.s
+; CHECK-NEXT:    ldp q18, q19, [x0, #128]
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ldp q20, q21, [x0, #160]
+; CHECK-NEXT:    splice z6.s, p0, z6.s, z7.s
+; CHECK-NEXT:    ldp q22, q23, [x0, #96]
+; CHECK-NEXT:    uzp1 z1.s, z17.s, z17.s
+; CHECK-NEXT:    uzp1 z19.s, z19.s, z19.s
+; CHECK-NEXT:    uzp1 z18.s, z18.s, z18.s
 ; CHECK-NEXT:    uzp1 z16.s, z16.s, z16.s
-; CHECK-NEXT:    ldp q7, q18, [x0, #96]
-; CHECK-NEXT:    uzp1 z17.s, z17.s, z17.s
-; CHECK-NEXT:    splice z16.s, p0, z16.s, z17.s
-; CHECK-NEXT:    uzp1 z7.s, z7.s, z7.s
-; CHECK-NEXT:    ldp q4, q5, [x0, #32]
-; CHECK-NEXT:    uzp1 z3.s, z18.s, z18.s
-; CHECK-NEXT:    splice z7.s, p0, z7.s, z3.s
+; CHECK-NEXT:    uzp1 z21.s, z21.s, z21.s
+; CHECK-NEXT:    uzp1 z20.s, z20.s, z20.s
+; CHECK-NEXT:    uzp1 z5.s, z5.s, z5.s
+; CHECK-NEXT:    uzp1 z7.s, z23.s, z23.s
+; CHECK-NEXT:    uzp1 z17.s, z22.s, z22.s
 ; CHECK-NEXT:    uzp1 z4.s, z4.s, z4.s
-; CHECK-NEXT:    ldp q19, q20, [x0]
-; CHECK-NEXT:    uzp1 z3.s, z5.s, z5.s
-; CHECK-NEXT:    stp q0, q2, [x1, #96]
-; CHECK-NEXT:    add z0.s, z6.s, z6.s
-; CHECK-NEXT:    splice z4.s, p0, z4.s, z3.s
-; CHECK-NEXT:    stp q0, q1, [x1, #64]
+; CHECK-NEXT:    splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    splice z18.s, p0, z18.s, z19.s
+; CHECK-NEXT:    splice z16.s, p0, z16.s, z1.s
+; CHECK-NEXT:    add z1.s, z6.s, z6.s
+; CHECK-NEXT:    splice z20.s, p0, z20.s, z21.s
+; CHECK-NEXT:    splice z17.s, p0, z17.s, z7.s
+; CHECK-NEXT:    splice z4.s, p0, z4.s, z5.s
+; CHECK-NEXT:    stp q0, q1, [x1, #96]
+; CHECK-NEXT:    add z2.s, z2.s, z2.s
+; CHECK-NEXT:    add z5.s, z18.s, z18.s
 ; CHECK-NEXT:    add z0.s, z16.s, z16.s
-; CHECK-NEXT:    uzp1 z18.s, z19.s, z19.s
-; CHECK-NEXT:    add z1.s, z7.s, z7.s
+; CHECK-NEXT:    add z3.s, z20.s, z20.s
+; CHECK-NEXT:    add z1.s, z17.s, z17.s
+; CHECK-NEXT:    add z4.s, z4.s, z4.s
+; CHECK-NEXT:    stp q5, q3, [x1, #64]
+; CHECK-NEXT:    stp q4, q2, [x1]
 ; CHECK-NEXT:    stp q0, q1, [x1, #32]
-; CHECK-NEXT:    add z1.s, z4.s, z4.s
-; CHECK-NEXT:    uzp1 z17.s, z20.s, z20.s
-; CHECK-NEXT:    splice z18.s, p0, z18.s, z17.s
-; CHECK-NEXT:    add z0.s, z18.s, z18.s
-; CHECK-NEXT:    stp q0, q1, [x1]
 ; CHECK-NEXT:    ret
   %a = load <32 x i64>, ptr %in
   %b = trunc <32 x i64> %a to <32 x i32>

diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index dc0e49fafcea78..3603c8e01143dd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -13,14 +13,14 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
 ; CHECK-NEXT:    mov z1.h, z0.h[1]
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    mov z2.h, z0.h[2]
-; CHECK-NEXT:    mov z0.h, z0.h[3]
-; CHECK-NEXT:    fmov w9, s1
-; CHECK-NEXT:    fmov w10, s2
-; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    mov z3.h, z0.h[3]
 ; CHECK-NEXT:    strh w8, [sp, #8]
-; CHECK-NEXT:    strh w9, [sp, #14]
-; CHECK-NEXT:    strh w10, [sp, #12]
-; CHECK-NEXT:    strh w11, [sp, #10]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    strh w8, [sp, #14]
+; CHECK-NEXT:    fmov w8, s3
+; CHECK-NEXT:    strh w9, [sp, #12]
+; CHECK-NEXT:    strh w8, [sp, #10]
 ; CHECK-NEXT:    ldr d0, [sp, #8]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
@@ -61,15 +61,14 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ldp q1, q3, [x1]
 ; CHECK-NEXT:    mov z0.b, z0.b[15]
 ; CHECK-NEXT:    mov z2.b, z1.b[15]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    ldr q0, [x1, #16]
-; CHECK-NEXT:    fmov w9, s2
 ; CHECK-NEXT:    insr z1.b, w8
-; CHECK-NEXT:    insr z0.b, w9
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    insr z3.b, w8
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x i8>, ptr %a
   %op2 = load <32 x i8>, ptr %b
@@ -84,8 +83,8 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) {
 define <2 x i16> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) {
 ; CHECK-LABEL: shuffle_ext_byone_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    revw z0.d, p0/m, z0.d
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
@@ -125,15 +124,14 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v16i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ldp q1, q3, [x1]
 ; CHECK-NEXT:    mov z0.h, z0.h[7]
 ; CHECK-NEXT:    mov z2.h, z1.h[7]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    ldr q0, [x1, #16]
-; CHECK-NEXT:    fmov w9, s2
 ; CHECK-NEXT:    insr z1.h, w8
-; CHECK-NEXT:    insr z0.h, w9
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    insr z3.h, w8
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i16>, ptr %a
   %op2 = load <16 x i16>, ptr %b
@@ -175,15 +173,14 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ldp q1, q3, [x1]
 ; CHECK-NEXT:    mov z0.s, z0.s[3]
 ; CHECK-NEXT:    mov z2.s, z1.s[3]
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    ldr q0, [x1, #16]
-; CHECK-NEXT:    fmov w9, s2
 ; CHECK-NEXT:    insr z1.s, w8
-; CHECK-NEXT:    insr z0.s, w9
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    fmov w8, s2
+; CHECK-NEXT:    insr z3.s, w8
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x i32>, ptr %a
   %op2 = load <8 x i32>, ptr %b
@@ -210,15 +207,14 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v4i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ldp q1, q3, [x1]
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
 ; CHECK-NEXT:    mov z2.d, z1.d[1]
 ; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    ldr q0, [x1, #16]
-; CHECK-NEXT:    fmov x9, d2
 ; CHECK-NEXT:    insr z1.d, x8
-; CHECK-NEXT:    insr z0.d, x9
-; CHECK-NEXT:    stp q1, q0, [x0]
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    insr z3.d, x8
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x i64>, ptr %a
   %op2 = load <4 x i64>, ptr %b
@@ -232,10 +228,10 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) {
 ; CHECK-LABEL: shuffle_ext_byone_v4f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT:    mov z0.h, z0.h[3]
-; CHECK-NEXT:    insr z1.h, h0
+; CHECK-NEXT:    mov z2.h, z0.h[3]
 ; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    insr z0.h, h2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x half> %ret
@@ -245,10 +241,10 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) {
 ; CHECK-LABEL: shuffle_ext_byone_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    mov z0.h, z0.h[7]
-; CHECK-NEXT:    insr z1.h, h0
+; CHECK-NEXT:    mov z2.h, z0.h[7]
 ; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    insr z0.h, h2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
   ret <8 x half> %ret
@@ -257,13 +253,13 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x1]
-; CHECK-NEXT:    mov z3.h, z1.h[7]
+; CHECK-NEXT:    ldp q1, q3, [x1]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    insr z2.h, h3
 ; CHECK-NEXT:    mov z0.h, z0.h[7]
+; CHECK-NEXT:    mov z2.h, z1.h[7]
 ; CHECK-NEXT:    insr z1.h, h0
-; CHECK-NEXT:    stp q1, q2, [x0]
+; CHECK-NEXT:    insr z3.h, h2
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %op2 = load <16 x half>, ptr %b
@@ -277,10 +273,10 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2)
 ; CHECK-LABEL: shuffle_ext_byone_v2f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
-; CHECK-NEXT:    mov z0.s, z0.s[1]
-; CHECK-NEXT:    insr z1.s, s0
+; CHECK-NEXT:    mov z2.s, z0.s[1]
 ; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    insr z0.s, s2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    ret
   %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x float> %ret
@@ -290,10 +286,10 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2)
 ; CHECK-LABEL: shuffle_ext_byone_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    mov z0.s, z0.s[3]
-; CHECK-NEXT:    insr z1.s, s0
+; CHECK-NEXT:    mov z2.s, z0.s[3]
 ; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    insr z0.s, s2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
   ret <4 x float> %ret
@@ -302,13 +298,13 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2)
 define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x1]
-; CHECK-NEXT:    mov z3.s, z1.s[3]
+; CHECK-NEXT:    ldp q1, q3, [x1]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    insr z2.s, s3
 ; CHECK-NEXT:    mov z0.s, z0.s[3]
+; CHECK-NEXT:    mov z2.s, z1.s[3]
 ; CHECK-NEXT:    insr z1.s, s0
-; CHECK-NEXT:    stp q1, q2, [x0]
+; CHECK-NEXT:    insr z3.s, s2
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %op2 = load <8 x float>, ptr %b
@@ -321,10 +317,10 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op
 ; CHECK-LABEL: shuffle_ext_byone_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
-; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    insr z1.d, d0
+; CHECK-NEXT:    mov z2.d, z0.d[1]
 ; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    insr z0.d, d2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    ret
   %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> <i32 1, i32 2>
   ret <2 x double> %ret
@@ -333,13 +329,13 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op
 define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x1]
-; CHECK-NEXT:    mov z3.d, z1.d[1]
+; CHECK-NEXT:    ldp q1, q3, [x1]
 ; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    insr z2.d, d3
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    mov z2.d, z1.d[1]
 ; CHECK-NEXT:    insr z1.d, d0
-; CHECK-NEXT:    stp q1, q2, [x0]
+; CHECK-NEXT:    insr z3.d, d2
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b
@@ -351,13 +347,13 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) {
 define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_reverse:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q1, q2, [x0]
-; CHECK-NEXT:    mov z3.d, z1.d[1]
+; CHECK-NEXT:    ldp q1, q3, [x0]
 ; CHECK-NEXT:    ldr q0, [x1, #16]
-; CHECK-NEXT:    insr z2.d, d3
 ; CHECK-NEXT:    mov z0.d, z0.d[1]
+; CHECK-NEXT:    mov z2.d, z1.d[1]
 ; CHECK-NEXT:    insr z1.d, d0
-; CHECK-NEXT:    stp q1, q2, [x0]
+; CHECK-NEXT:    insr z3.d, d2
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %op2 = load <4 x double>, ptr %b

diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
index 58135e44fa912d..f32c80d392b633 100644
--- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll
+++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
@@ -84,33 +84,33 @@ define i32 @sve_caller_non_sve_callee(<vscale x 4 x i32> %arg) nounwind {
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    bl non_sve_callee
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -159,33 +159,33 @@ define i32 @sve_caller_non_sve_callee_fastcc(<vscale x 4 x i32> %arg) nounwind {
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    bl non_sve_callee
 ; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll
index 6869cd5d30ed6c..dfa4a6148b86e9 100644
--- a/llvm/test/CodeGen/AArch64/sve-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll
@@ -111,35 +111,37 @@ define <vscale x 16 x i1> @trunc_i64toi1_split3(<vscale x 16 x i64> %in) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    and z7.d, z7.d, #0x1
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    and z7.d, z7.d, #0x1
 ; CHECK-NEXT:    and z6.d, z6.d, #0x1
 ; CHECK-NEXT:    and z5.d, z5.d, #0x1
 ; CHECK-NEXT:    and z4.d, z4.d, #0x1
-; CHECK-NEXT:    cmpne p1.d, p0/z, z7.d, #0
-; CHECK-NEXT:    cmpne p2.d, p0/z, z6.d, #0
-; CHECK-NEXT:    cmpne p3.d, p0/z, z5.d, #0
-; CHECK-NEXT:    cmpne p4.d, p0/z, z4.d, #0
 ; CHECK-NEXT:    and z3.d, z3.d, #0x1
 ; CHECK-NEXT:    and z2.d, z2.d, #0x1
 ; CHECK-NEXT:    and z1.d, z1.d, #0x1
 ; CHECK-NEXT:    and z0.d, z0.d, #0x1
+; CHECK-NEXT:    cmpne p1.d, p0/z, z7.d, #0
+; CHECK-NEXT:    cmpne p2.d, p0/z, z6.d, #0
+; CHECK-NEXT:    cmpne p3.d, p0/z, z5.d, #0
+; CHECK-NEXT:    cmpne p4.d, p0/z, z4.d, #0
+; CHECK-NEXT:    cmpne p5.d, p0/z, z3.d, #0
+; CHECK-NEXT:    cmpne p6.d, p0/z, z2.d, #0
 ; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT:    uzp1 p2.s, p4.s, p3.s
-; CHECK-NEXT:    cmpne p3.d, p0/z, z3.d, #0
-; CHECK-NEXT:    cmpne p4.d, p0/z, z2.d, #0
-; CHECK-NEXT:    cmpne p5.d, p0/z, z1.d, #0
+; CHECK-NEXT:    cmpne p2.d, p0/z, z1.d, #0
 ; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT:    uzp1 p3.s, p4.s, p3.s
-; CHECK-NEXT:    uzp1 p0.s, p0.s, p5.s
+; CHECK-NEXT:    uzp1 p4.s, p6.s, p5.s
+; CHECK-NEXT:    ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p0.s, p0.s, p2.s
 ; CHECK-NEXT:    ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    uzp1 p1.h, p3.h, p1.h
+; CHECK-NEXT:    uzp1 p0.h, p0.h, p4.h
 ; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
-; CHECK-NEXT:    uzp1 p0.h, p0.h, p3.h
 ; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
index 8ef7b8032cc0ff..45d796f68c4249 100644
--- a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
@@ -87,16 +87,15 @@ define <vscale x 32 x i8> @umulo_nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    mul z4.b, p0/m, z4.b, z3.b
-; CHECK-NEXT:    umulh z1.b, p0/m, z1.b, z3.b
-; CHECK-NEXT:    movprfx z3, z0
-; CHECK-NEXT:    umulh z3.b, p0/m, z3.b, z2.b
-; CHECK-NEXT:    cmpne p1.b, p0/z, z1.b, #0
+; CHECK-NEXT:    umulh z4.b, p0/m, z4.b, z3.b
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    umulh z5.b, p0/m, z5.b, z2.b
+; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z3.b
 ; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z2.b
-; CHECK-NEXT:    cmpne p0.b, p0/z, z3.b, #0
-; CHECK-NEXT:    mov z4.b, p1/m, #0 // =0x0
+; CHECK-NEXT:    cmpne p1.b, p0/z, z4.b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z5.b, #0
 ; CHECK-NEXT:    mov z0.b, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, z4.d
+; CHECK-NEXT:    mov z1.b, p1/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i8>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i8(<vscale x 32 x i8> %x, <vscale x 32 x i8> %y)
   %b = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i1> } %a, 0
@@ -112,27 +111,25 @@ define <vscale x 64 x i8> @umulo_nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    mul z24.b, p0/m, z24.b, z7.b
-; CHECK-NEXT:    umulh z3.b, p0/m, z3.b, z7.b
-; CHECK-NEXT:    cmpne p1.b, p0/z, z3.b, #0
-; CHECK-NEXT:    movprfx z3, z2
-; CHECK-NEXT:    umulh z3.b, p0/m, z3.b, z6.b
-; CHECK-NEXT:    cmpne p2.b, p0/z, z3.b, #0
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z5.b
-; CHECK-NEXT:    umulh z1.b, p0/m, z1.b, z5.b
-; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z6.b
-; CHECK-NEXT:    cmpne p3.b, p0/z, z1.b, #0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    umulh z1.b, p0/m, z1.b, z4.b
+; CHECK-NEXT:    umulh z24.b, p0/m, z24.b, z7.b
+; CHECK-NEXT:    movprfx z25, z0
+; CHECK-NEXT:    umulh z25.b, p0/m, z25.b, z4.b
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    umulh z26.b, p0/m, z26.b, z6.b
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    umulh z27.b, p0/m, z27.b, z5.b
+; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z7.b
 ; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z4.b
-; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
-; CHECK-NEXT:    mov z3.b, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z24.b, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.b, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.b, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, z3.d
-; CHECK-NEXT:    mov z3.d, z24.d
+; CHECK-NEXT:    cmpne p1.b, p0/z, z25.b, #0
+; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z6.b
+; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z5.b
+; CHECK-NEXT:    cmpne p2.b, p0/z, z24.b, #0
+; CHECK-NEXT:    cmpne p3.b, p0/z, z26.b, #0
+; CHECK-NEXT:    cmpne p0.b, p0/z, z27.b, #0
+; CHECK-NEXT:    mov z0.b, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.b, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.b, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.b, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 64 x i8>, <vscale x 64 x i1> } @llvm.umul.with.overflow.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %y)
   %b = extractvalue { <vscale x 64 x i8>, <vscale x 64 x i1> } %a, 0
@@ -207,16 +204,15 @@ define <vscale x 16 x i16> @umulo_nxv16i16(<vscale x 16 x i16> %x, <vscale x 16
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    mul z4.h, p0/m, z4.h, z3.h
-; CHECK-NEXT:    umulh z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT:    movprfx z3, z0
-; CHECK-NEXT:    umulh z3.h, p0/m, z3.h, z2.h
-; CHECK-NEXT:    cmpne p1.h, p0/z, z1.h, #0
+; CHECK-NEXT:    umulh z4.h, p0/m, z4.h, z3.h
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    umulh z5.h, p0/m, z5.h, z2.h
+; CHECK-NEXT:    mul z1.h, p0/m, z1.h, z3.h
 ; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z2.h
-; CHECK-NEXT:    cmpne p0.h, p0/z, z3.h, #0
-; CHECK-NEXT:    mov z4.h, p1/m, #0 // =0x0
+; CHECK-NEXT:    cmpne p1.h, p0/z, z4.h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z5.h, #0
 ; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, z4.d
+; CHECK-NEXT:    mov z1.h, p1/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i16>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i16(<vscale x 16 x i16> %x, <vscale x 16 x i16> %y)
   %b = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i1> } %a, 0
@@ -232,27 +228,25 @@ define <vscale x 32 x i16> @umulo_nxv32i16(<vscale x 32 x i16> %x, <vscale x 32
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    mul z24.h, p0/m, z24.h, z7.h
-; CHECK-NEXT:    umulh z3.h, p0/m, z3.h, z7.h
-; CHECK-NEXT:    cmpne p1.h, p0/z, z3.h, #0
-; CHECK-NEXT:    movprfx z3, z2
-; CHECK-NEXT:    umulh z3.h, p0/m, z3.h, z6.h
-; CHECK-NEXT:    cmpne p2.h, p0/z, z3.h, #0
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z5.h
-; CHECK-NEXT:    umulh z1.h, p0/m, z1.h, z5.h
-; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z6.h
-; CHECK-NEXT:    cmpne p3.h, p0/z, z1.h, #0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    umulh z1.h, p0/m, z1.h, z4.h
+; CHECK-NEXT:    umulh z24.h, p0/m, z24.h, z7.h
+; CHECK-NEXT:    movprfx z25, z0
+; CHECK-NEXT:    umulh z25.h, p0/m, z25.h, z4.h
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    umulh z26.h, p0/m, z26.h, z6.h
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    umulh z27.h, p0/m, z27.h, z5.h
+; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z7.h
 ; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z4.h
-; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
-; CHECK-NEXT:    mov z3.h, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z24.h, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.h, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.h, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, z3.d
-; CHECK-NEXT:    mov z3.d, z24.d
+; CHECK-NEXT:    cmpne p1.h, p0/z, z25.h, #0
+; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z6.h
+; CHECK-NEXT:    mul z1.h, p0/m, z1.h, z5.h
+; CHECK-NEXT:    cmpne p2.h, p0/z, z24.h, #0
+; CHECK-NEXT:    cmpne p3.h, p0/z, z26.h, #0
+; CHECK-NEXT:    cmpne p0.h, p0/z, z27.h, #0
+; CHECK-NEXT:    mov z0.h, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.h, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.h, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.h, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 32 x i16>, <vscale x 32 x i1> } @llvm.umul.with.overflow.nxv32i16(<vscale x 32 x i16> %x, <vscale x 32 x i16> %y)
   %b = extractvalue { <vscale x 32 x i16>, <vscale x 32 x i1> } %a, 0
@@ -307,16 +301,15 @@ define <vscale x 8 x i32> @umulo_nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i3
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    mul z4.s, p0/m, z4.s, z3.s
-; CHECK-NEXT:    umulh z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT:    movprfx z3, z0
-; CHECK-NEXT:    umulh z3.s, p0/m, z3.s, z2.s
-; CHECK-NEXT:    cmpne p1.s, p0/z, z1.s, #0
+; CHECK-NEXT:    umulh z4.s, p0/m, z4.s, z3.s
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    umulh z5.s, p0/m, z5.s, z2.s
+; CHECK-NEXT:    mul z1.s, p0/m, z1.s, z3.s
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT:    cmpne p0.s, p0/z, z3.s, #0
-; CHECK-NEXT:    mov z4.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    cmpne p1.s, p0/z, z4.s, #0
+; CHECK-NEXT:    cmpne p0.s, p0/z, z5.s, #0
 ; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, z4.d
+; CHECK-NEXT:    mov z1.s, p1/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i32>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y)
   %b = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i1> } %a, 0
@@ -332,27 +325,25 @@ define <vscale x 16 x i32> @umulo_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    mul z24.s, p0/m, z24.s, z7.s
-; CHECK-NEXT:    umulh z3.s, p0/m, z3.s, z7.s
-; CHECK-NEXT:    cmpne p1.s, p0/z, z3.s, #0
-; CHECK-NEXT:    movprfx z3, z2
-; CHECK-NEXT:    umulh z3.s, p0/m, z3.s, z6.s
-; CHECK-NEXT:    cmpne p2.s, p0/z, z3.s, #0
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z5.s
-; CHECK-NEXT:    umulh z1.s, p0/m, z1.s, z5.s
-; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z6.s
-; CHECK-NEXT:    cmpne p3.s, p0/z, z1.s, #0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    umulh z1.s, p0/m, z1.s, z4.s
+; CHECK-NEXT:    umulh z24.s, p0/m, z24.s, z7.s
+; CHECK-NEXT:    movprfx z25, z0
+; CHECK-NEXT:    umulh z25.s, p0/m, z25.s, z4.s
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    umulh z26.s, p0/m, z26.s, z6.s
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    umulh z27.s, p0/m, z27.s, z5.s
+; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z7.s
 ; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z4.s
-; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
-; CHECK-NEXT:    mov z3.s, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z24.s, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.s, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.s, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, z3.d
-; CHECK-NEXT:    mov z3.d, z24.d
+; CHECK-NEXT:    cmpne p1.s, p0/z, z25.s, #0
+; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z6.s
+; CHECK-NEXT:    mul z1.s, p0/m, z1.s, z5.s
+; CHECK-NEXT:    cmpne p2.s, p0/z, z24.s, #0
+; CHECK-NEXT:    cmpne p3.s, p0/z, z26.s, #0
+; CHECK-NEXT:    cmpne p0.s, p0/z, z27.s, #0
+; CHECK-NEXT:    mov z0.s, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.s, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.s, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.s, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 16 x i32>, <vscale x 16 x i1> } @llvm.umul.with.overflow.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
   %b = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i1> } %a, 0
@@ -387,16 +378,15 @@ define <vscale x 4 x i64> @umulo_nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i6
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    movprfx z4, z1
-; CHECK-NEXT:    mul z4.d, p0/m, z4.d, z3.d
-; CHECK-NEXT:    umulh z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT:    movprfx z3, z0
-; CHECK-NEXT:    umulh z3.d, p0/m, z3.d, z2.d
-; CHECK-NEXT:    cmpne p1.d, p0/z, z1.d, #0
+; CHECK-NEXT:    umulh z4.d, p0/m, z4.d, z3.d
+; CHECK-NEXT:    movprfx z5, z0
+; CHECK-NEXT:    umulh z5.d, p0/m, z5.d, z2.d
+; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z3.d
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT:    cmpne p0.d, p0/z, z3.d, #0
-; CHECK-NEXT:    mov z4.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    cmpne p1.d, p0/z, z4.d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z5.d, #0
 ; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, z4.d
+; CHECK-NEXT:    mov z1.d, p1/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 4 x i64>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y)
   %b = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i1> } %a, 0
@@ -412,27 +402,25 @@ define <vscale x 8 x i64> @umulo_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i6
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    movprfx z24, z3
-; CHECK-NEXT:    mul z24.d, p0/m, z24.d, z7.d
-; CHECK-NEXT:    umulh z3.d, p0/m, z3.d, z7.d
-; CHECK-NEXT:    cmpne p1.d, p0/z, z3.d, #0
-; CHECK-NEXT:    movprfx z3, z2
-; CHECK-NEXT:    umulh z3.d, p0/m, z3.d, z6.d
-; CHECK-NEXT:    cmpne p2.d, p0/z, z3.d, #0
-; CHECK-NEXT:    movprfx z3, z1
-; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z5.d
-; CHECK-NEXT:    umulh z1.d, p0/m, z1.d, z5.d
-; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z6.d
-; CHECK-NEXT:    cmpne p3.d, p0/z, z1.d, #0
-; CHECK-NEXT:    movprfx z1, z0
-; CHECK-NEXT:    umulh z1.d, p0/m, z1.d, z4.d
+; CHECK-NEXT:    umulh z24.d, p0/m, z24.d, z7.d
+; CHECK-NEXT:    movprfx z25, z0
+; CHECK-NEXT:    umulh z25.d, p0/m, z25.d, z4.d
+; CHECK-NEXT:    movprfx z26, z2
+; CHECK-NEXT:    umulh z26.d, p0/m, z26.d, z6.d
+; CHECK-NEXT:    movprfx z27, z1
+; CHECK-NEXT:    umulh z27.d, p0/m, z27.d, z5.d
+; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z7.d
 ; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z4.d
-; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT:    mov z3.d, p3/m, #0 // =0x0
-; CHECK-NEXT:    mov z24.d, p1/m, #0 // =0x0
-; CHECK-NEXT:    mov z0.d, p0/m, #0 // =0x0
-; CHECK-NEXT:    mov z2.d, p2/m, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, z3.d
-; CHECK-NEXT:    mov z3.d, z24.d
+; CHECK-NEXT:    cmpne p1.d, p0/z, z25.d, #0
+; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z6.d
+; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z5.d
+; CHECK-NEXT:    cmpne p2.d, p0/z, z24.d, #0
+; CHECK-NEXT:    cmpne p3.d, p0/z, z26.d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z27.d, #0
+; CHECK-NEXT:    mov z0.d, p1/m, #0 // =0x0
+; CHECK-NEXT:    mov z3.d, p2/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.d, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z2.d, p3/m, #0 // =0x0
 ; CHECK-NEXT:    ret
   %a = call { <vscale x 8 x i64>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
   %b = extractvalue { <vscale x 8 x i64>, <vscale x 8 x i1> } %a, 0

diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll
index 0bdaefdfc2a3f0..898090340869ee 100644
--- a/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll
@@ -80,8 +80,8 @@ define i1 @reduce_and_insert_subvec_into_var(<vscale x 4 x i1> %in, <vscale x 16
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    punpklo p3.h, p1.b
 ; CHECK-NEXT:    punpkhi p1.h, p1.b
-; CHECK-NEXT:    punpkhi p3.h, p3.b
 ; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    punpkhi p3.h, p3.b
 ; CHECK-NEXT:    uzp1 p0.h, p0.h, p3.h
 ; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
 ; CHECK-NEXT:    nots p0.b, p2/z, p0.b

diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index fd2cf67bea2d67..9920bc6048e8f3 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -185,17 +185,17 @@ ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
 define {<vscale x 8 x i64>, <vscale x 8 x i64>}  @vector_deinterleave_nxv8i64_nxv16i64(<vscale x 16 x i64> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv8i64_nxv16i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z24.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z25.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z26.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z24.d, z2.d, z3.d
+; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z26.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp1 z27.d, z6.d, z7.d
 ; CHECK-NEXT:    uzp2 z28.d, z0.d, z1.d
 ; CHECK-NEXT:    uzp2 z29.d, z2.d, z3.d
 ; CHECK-NEXT:    uzp2 z30.d, z4.d, z5.d
 ; CHECK-NEXT:    uzp2 z7.d, z6.d, z7.d
-; CHECK-NEXT:    mov z0.d, z26.d
-; CHECK-NEXT:    mov z1.d, z25.d
-; CHECK-NEXT:    mov z2.d, z24.d
+; CHECK-NEXT:    mov z0.d, z25.d
+; CHECK-NEXT:    mov z1.d, z24.d
+; CHECK-NEXT:    mov z2.d, z26.d
 ; CHECK-NEXT:    mov z3.d, z27.d
 ; CHECK-NEXT:    mov z4.d, z28.d
 ; CHECK-NEXT:    mov z5.d, z29.d

diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index 51700bc252f3be..23bf5065286ebd 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -164,12 +164,13 @@ define <vscale x 4 x i1> @interleave2_nxv4i1(<vscale x 2 x i1> %vec0, <vscale x
 define <vscale x 16 x i32> @interleave2_nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1) {
 ; CHECK-LABEL: interleave2_nxv16i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.s, z0.s, z2.s
-; CHECK-NEXT:    zip2 z5.s, z0.s, z2.s
-; CHECK-NEXT:    zip1 z2.s, z1.s, z3.s
+; CHECK-NEXT:    zip1 z4.s, z1.s, z3.s
+; CHECK-NEXT:    zip1 z5.s, z0.s, z2.s
+; CHECK-NEXT:    zip2 z2.s, z0.s, z2.s
 ; CHECK-NEXT:    zip2 z3.s, z1.s, z3.s
-; CHECK-NEXT:    mov z0.d, z4.d
-; CHECK-NEXT:    mov z1.d, z5.d
+; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    mov z1.d, z2.d
+; CHECK-NEXT:    mov z2.d, z4.d
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 16 x i32>@llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
   ret <vscale x 16 x i32> %retval
@@ -178,12 +179,13 @@ define <vscale x 16 x i32> @interleave2_nxv16i32(<vscale x 8 x i32> %vec0, <vsca
 define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1) {
 ; CHECK-LABEL: interleave2_nxv8i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z5.d, z0.d, z2.d
-; CHECK-NEXT:    zip1 z2.d, z1.d, z3.d
+; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
+; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
+; CHECK-NEXT:    zip2 z2.d, z0.d, z2.d
 ; CHECK-NEXT:    zip2 z3.d, z1.d, z3.d
-; CHECK-NEXT:    mov z0.d, z4.d
-; CHECK-NEXT:    mov z1.d, z5.d
+; CHECK-NEXT:    mov z0.d, z5.d
+; CHECK-NEXT:    mov z1.d, z2.d
+; CHECK-NEXT:    mov z2.d, z4.d
 ; CHECK-NEXT:    ret
   %retval = call <vscale x 8 x i64> @llvm.experimental.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
   ret <vscale x 8 x i64> %retval

diff --git a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
index ea47f0d26ea7ec..9253d5ab4531ac 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
@@ -536,7 +536,7 @@ define <vscale x 4 x float> @splat_nxv4f32_fold(<vscale x 4 x float> %x) {
 define <vscale x 2 x float> @splat_nxv2f32_fmov_fold() {
 ; CHECK-LABEL: splat_nxv2f32_fmov_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1109917696
+; CHECK-NEXT:    mov w8, #1109917696 // =0x42280000
 ; CHECK-NEXT:    mov z0.s, w8
 ; CHECK-NEXT:    ret
   %1 = insertelement <vscale x 2 x float> undef, float 4.200000e+01, i32 0
@@ -547,7 +547,7 @@ define <vscale x 2 x float> @splat_nxv2f32_fmov_fold() {
 define <vscale x 4 x float> @splat_nxv4f32_fmov_fold() {
 ; CHECK-LABEL: splat_nxv4f32_fmov_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1109917696
+; CHECK-NEXT:    mov w8, #1109917696 // =0x42280000
 ; CHECK-NEXT:    mov z0.s, w8
 ; CHECK-NEXT:    ret
   %1 = insertelement <vscale x 4 x float> undef, float 4.200000e+01, i32 0
@@ -558,7 +558,7 @@ define <vscale x 4 x float> @splat_nxv4f32_fmov_fold() {
 define <vscale x 2 x double> @splat_nxv2f64_fmov_fold() {
 ; CHECK-LABEL: splat_nxv2f64_fmov_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #4631107791820423168
+; CHECK-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    ret
   %1 = insertelement <vscale x 2 x double> undef, double 4.200000e+01, i32 0
@@ -571,7 +571,7 @@ define <vscale x 2 x double> @splat_nxv2f64_fmov_fold() {
 define <vscale x 2 x float> @splat_nxv2f32_imm_out_of_range() {
 ; CHECK-LABEL: splat_nxv2f32_imm_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #7864
+; CHECK-NEXT:    mov w8, #7864 // =0x1eb8
 ; CHECK-NEXT:    movk w8, #16469, lsl #16
 ; CHECK-NEXT:    mov z0.s, w8
 ; CHECK-NEXT:    ret
@@ -583,7 +583,7 @@ define <vscale x 2 x float> @splat_nxv2f32_imm_out_of_range() {
 define <vscale x 4 x float> @splat_nxv4f32_imm_out_of_range() {
 ; CHECK-LABEL: splat_nxv4f32_imm_out_of_range:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #7864
+; CHECK-NEXT:    mov w8, #7864 // =0x1eb8
 ; CHECK-NEXT:    movk w8, #16469, lsl #16
 ; CHECK-NEXT:    mov z0.s, w8
 ; CHECK-NEXT:    ret
@@ -595,9 +595,9 @@ define <vscale x 4 x float> @splat_nxv4f32_imm_out_of_range() {
 define <vscale x 2 x double> @splat_nxv2f64_imm_out_of_range() {
 ; CHECK-LABEL: splat_nxv2f64_imm_out_of_range:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    adrp x8, .LCPI57_0
 ; CHECK-NEXT:    add x8, x8, :lo12:.LCPI57_0
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
 ; CHECK-NEXT:    ret
   %1 = insertelement <vscale x 2 x double> undef, double 3.33, i32 0

diff --git a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
index 3e6236a149ff31..1323ffcf12db0f 100644
--- a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
@@ -7,16 +7,16 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @func_vscale_none(ptr %a, ptr %b) #0 {
 ; CHECK-NOARG-LABEL: func_vscale_none:
 ; CHECK-NOARG:       // %bb.0:
-; CHECK-NOARG-NEXT:    ldp q0, q1, [x0, #32]
-; CHECK-NOARG-NEXT:    ldp q4, q5, [x1, #32]
-; CHECK-NOARG-NEXT:    add v0.4s, v0.4s, v4.4s
-; CHECK-NOARG-NEXT:    ldp q2, q3, [x0]
-; CHECK-NOARG-NEXT:    add v1.4s, v1.4s, v5.4s
-; CHECK-NOARG-NEXT:    ldp q6, q4, [x1]
-; CHECK-NOARG-NEXT:    stp q0, q1, [x0, #32]
-; CHECK-NOARG-NEXT:    add v2.4s, v2.4s, v6.4s
-; CHECK-NOARG-NEXT:    add v0.4s, v3.4s, v4.4s
-; CHECK-NOARG-NEXT:    stp q2, q0, [x0]
+; CHECK-NOARG-NEXT:    ldp q0, q3, [x1, #32]
+; CHECK-NOARG-NEXT:    ldp q1, q2, [x0, #32]
+; CHECK-NOARG-NEXT:    ldp q4, q6, [x1]
+; CHECK-NOARG-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NOARG-NEXT:    ldp q1, q5, [x0]
+; CHECK-NOARG-NEXT:    add v2.4s, v2.4s, v3.4s
+; CHECK-NOARG-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-NOARG-NEXT:    add v3.4s, v5.4s, v6.4s
+; CHECK-NOARG-NEXT:    stp q0, q2, [x0, #32]
+; CHECK-NOARG-NEXT:    stp q1, q3, [x0]
 ; CHECK-NOARG-NEXT:    ret
 ;
 ; CHECK-ARG-LABEL: func_vscale_none:
@@ -39,16 +39,16 @@ attributes #0 = { "target-features"="+sve" }
 define void @func_vscale1_1(ptr %a, ptr %b) #1 {
 ; CHECK-LABEL: func_vscale1_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0, #32]
-; CHECK-NEXT:    ldp q4, q5, [x1, #32]
-; CHECK-NEXT:    add v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    ldp q2, q3, [x0]
-; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
-; CHECK-NEXT:    ldp q6, q4, [x1]
-; CHECK-NEXT:    stp q0, q1, [x0, #32]
-; CHECK-NEXT:    add v2.4s, v2.4s, v6.4s
-; CHECK-NEXT:    add v0.4s, v3.4s, v4.4s
-; CHECK-NEXT:    stp q2, q0, [x0]
+; CHECK-NEXT:    ldp q0, q3, [x1, #32]
+; CHECK-NEXT:    ldp q1, q2, [x0, #32]
+; CHECK-NEXT:    ldp q4, q6, [x1]
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ldp q1, q5, [x0]
+; CHECK-NEXT:    add v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v4.4s
+; CHECK-NEXT:    add v3.4s, v5.4s, v6.4s
+; CHECK-NEXT:    stp q0, q2, [x0, #32]
+; CHECK-NEXT:    stp q1, q3, [x0]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x i32>, ptr %a
   %op2 = load <16 x i32>, ptr %b
@@ -62,8 +62,8 @@ attributes #1 = { "target-features"="+sve" vscale_range(1,1) }
 define void @func_vscale2_2(ptr %a, ptr %b) #2 {
 ; CHECK-LABEL: func_vscale2_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #8
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -85,8 +85,8 @@ attributes #2 = { "target-features"="+sve" vscale_range(2,2) }
 define void @func_vscale2_4(ptr %a, ptr %b) #3 {
 ; CHECK-LABEL: func_vscale2_4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #8
 ; CHECK-NEXT:    ptrue p0.s, vl8
+; CHECK-NEXT:    mov x8, #8 // =0x8
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
 ; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]

diff --git a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
index 48e90cd0d09d67..a4f009b289b71b 100644
--- a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
@@ -118,9 +118,9 @@ define <vscale x 2 x i64> @sel_64_shifted(<vscale x 2 x i1> %p) {
 define <vscale x 8 x i16> @sel_16_illegal_wrong_extension(<vscale x 8 x i1> %p) {
 ; CHECK-LABEL: sel_16_illegal_wrong_extension:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.h, #0 // =0x0
-; CHECK-NEXT:    mov z1.h, #128 // =0x80
-; CHECK-NEXT:    mov z0.h, p0/m, z1.h
+; CHECK-NEXT:    mov z0.h, #128 // =0x80
+; CHECK-NEXT:    mov z1.h, #0 // =0x0
+; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %vec = shufflevector <vscale x 8 x i16> insertelement (<vscale x 8 x i16> undef, i16 128, i32 0), <vscale x 8 x i16> zeroinitializer, <vscale x 8 x i32> zeroinitializer
   %sel = select <vscale x 8 x i1> %p, <vscale x 8 x i16> %vec, <vscale x 8 x i16> zeroinitializer
@@ -130,9 +130,9 @@ define <vscale x 8 x i16> @sel_16_illegal_wrong_extension(<vscale x 8 x i1> %p)
 define <vscale x 4 x i32> @sel_32_illegal_wrong_extension(<vscale x 4 x i1> %p) {
 ; CHECK-LABEL: sel_32_illegal_wrong_extension:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.s, #0 // =0x0
-; CHECK-NEXT:    mov z1.s, #128 // =0x80
-; CHECK-NEXT:    mov z0.s, p0/m, z1.s
+; CHECK-NEXT:    mov z0.s, #128 // =0x80
+; CHECK-NEXT:    mov z1.s, #0 // =0x0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT:    ret
   %vec = shufflevector <vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 128, i32 0), <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> zeroinitializer
   %sel = select <vscale x 4 x i1> %p, <vscale x 4 x i32> %vec, <vscale x 4 x i32> zeroinitializer
@@ -142,9 +142,9 @@ define <vscale x 4 x i32> @sel_32_illegal_wrong_extension(<vscale x 4 x i1> %p)
 define <vscale x 2 x i64> @sel_64_illegal_wrong_extension(<vscale x 2 x i1> %p) {
 ; CHECK-LABEL: sel_64_illegal_wrong_extension:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z0.d, #0 // =0x0
-; CHECK-NEXT:    mov z1.d, #128 // =0x80
-; CHECK-NEXT:    mov z0.d, p0/m, z1.d
+; CHECK-NEXT:    mov z0.d, #128 // =0x80
+; CHECK-NEXT:    mov z1.d, #0 // =0x0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
 ; CHECK-NEXT:    ret
   %vec = shufflevector <vscale x 2 x i64> insertelement (<vscale x 2 x i64> undef, i64 128, i32 0), <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i32> zeroinitializer
   %sel = select <vscale x 2 x i1> %p, <vscale x 2 x i64> %vec, <vscale x 2 x i64> zeroinitializer
@@ -154,7 +154,7 @@ define <vscale x 2 x i64> @sel_64_illegal_wrong_extension(<vscale x 2 x i1> %p)
 define <vscale x 8 x i16> @sel_16_illegal_shifted(<vscale x 8 x i1> %p) {
 ; CHECK-LABEL: sel_16_illegal_shifted:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #513
+; CHECK-NEXT:    mov w8, #513 // =0x201
 ; CHECK-NEXT:    mov z1.h, #0 // =0x0
 ; CHECK-NEXT:    mov z0.h, w8
 ; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
@@ -167,7 +167,7 @@ define <vscale x 8 x i16> @sel_16_illegal_shifted(<vscale x 8 x i1> %p) {
 define <vscale x 4 x i32> @sel_32_illegal_shifted(<vscale x 4 x i1> %p) {
 ; CHECK-LABEL: sel_32_illegal_shifted:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #513
+; CHECK-NEXT:    mov w8, #513 // =0x201
 ; CHECK-NEXT:    mov z1.s, #0 // =0x0
 ; CHECK-NEXT:    mov z0.s, w8
 ; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
@@ -180,7 +180,7 @@ define <vscale x 4 x i32> @sel_32_illegal_shifted(<vscale x 4 x i1> %p) {
 define <vscale x 2 x i64> @sel_64_illegal_shifted(<vscale x 2 x i1> %p) {
 ; CHECK-LABEL: sel_64_illegal_shifted:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #513
+; CHECK-NEXT:    mov w8, #513 // =0x201
 ; CHECK-NEXT:    mov z1.d, #0 // =0x0
 ; CHECK-NEXT:    mov z0.d, x8
 ; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
@@ -363,7 +363,7 @@ ret <vscale x 2 x double> %sel
 define <vscale x 8 x half> @sel_merge_nxv8f16_negative_zero(<vscale x 8 x i1> %p, <vscale x 8 x half> %in) {
 ; CHECK-LABEL: sel_merge_nxv8f16_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768
+; CHECK-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
@@ -375,7 +375,7 @@ ret <vscale x 8 x half> %sel
 define <vscale x 4 x half> @sel_merge_nx4f16_negative_zero(<vscale x 4 x i1> %p, <vscale x 4 x half> %in) {
 ; CHECK-LABEL: sel_merge_nx4f16_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768
+; CHECK-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
@@ -387,7 +387,7 @@ ret <vscale x 4 x half> %sel
 define <vscale x 2 x half> @sel_merge_nx2f16_negative_zero(<vscale x 2 x i1> %p, <vscale x 2 x half> %in) {
 ; CHECK-LABEL: sel_merge_nx2f16_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32768
+; CHECK-NEXT:    mov w8, #32768 // =0x8000
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
@@ -399,7 +399,7 @@ ret <vscale x 2 x half> %sel
 define <vscale x 4 x float> @sel_merge_nx4f32_negative_zero(<vscale x 4 x i1> %p, <vscale x 4 x float> %in) {
 ; CHECK-LABEL: sel_merge_nx4f32_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-2147483648
+; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
@@ -411,7 +411,7 @@ ret <vscale x 4 x float> %sel
 define <vscale x 2 x float> @sel_merge_nx2f32_negative_zero(<vscale x 2 x i1> %p, <vscale x 2 x float> %in) {
 ; CHECK-LABEL: sel_merge_nx2f32_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-2147483648
+; CHECK-NEXT:    mov w8, #-2147483648 // =0x80000000
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
@@ -423,7 +423,7 @@ ret <vscale x 2 x float> %sel
 define <vscale x 2 x double> @sel_merge_nx2f64_negative_zero(<vscale x 2 x i1> %p, <vscale x 2 x double> %in) {
 ; CHECK-LABEL: sel_merge_nx2f64_negative_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-9223372036854775808
+; CHECK-NEXT:    mov x8, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret
@@ -502,7 +502,7 @@ define <vscale x 2 x i64> @sel_merge_64_illegal_wrong_extension(<vscale x 2 x i1
 define <vscale x 8 x i16> @sel_merge_16_illegal_shifted(<vscale x 8 x i1> %p, <vscale x 8 x i16> %in) {
 ; CHECK-LABEL: sel_merge_16_illegal_shifted:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #513
+; CHECK-NEXT:    mov w8, #513 // =0x201
 ; CHECK-NEXT:    mov z1.h, w8
 ; CHECK-NEXT:    mov z0.h, p0/m, z1.h
 ; CHECK-NEXT:    ret
@@ -514,7 +514,7 @@ define <vscale x 8 x i16> @sel_merge_16_illegal_shifted(<vscale x 8 x i1> %p, <v
 define <vscale x 4 x i32> @sel_merge_32_illegal_shifted(<vscale x 4 x i1> %p, <vscale x 4 x i32> %in) {
 ; CHECK-LABEL: sel_merge_32_illegal_shifted:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #513
+; CHECK-NEXT:    mov w8, #513 // =0x201
 ; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    mov z0.s, p0/m, z1.s
 ; CHECK-NEXT:    ret
@@ -526,7 +526,7 @@ define <vscale x 4 x i32> @sel_merge_32_illegal_shifted(<vscale x 4 x i1> %p, <v
 define <vscale x 2 x i64> @sel_merge_64_illegal_shifted(<vscale x 2 x i1> %p, <vscale x 2 x i64> %in) {
 ; CHECK-LABEL: sel_merge_64_illegal_shifted:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #513
+; CHECK-NEXT:    mov w8, #513 // =0x201
 ; CHECK-NEXT:    mov z1.d, x8
 ; CHECK-NEXT:    mov z0.d, p0/m, z1.d
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll
index 1e6114f78b422a..7f65997fda7ad6 100644
--- a/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-fcopysign.ll
@@ -58,14 +58,14 @@ define <vscale x 4 x float> @test_copysign_v4f32_v4f64(<vscale x 4 x float> %a,
 ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f32_v4f64:
 ; CHECK_EXTEND_ROUND:       // %bb.0:
 ; CHECK_EXTEND_ROUND-NEXT:    ptrue p0.d
-; CHECK_EXTEND_ROUND-NEXT:    mov z3.s, #0x7fffffff
+; CHECK_EXTEND_ROUND-NEXT:    uunpkhi z3.d, z0.s
+; CHECK_EXTEND_ROUND-NEXT:    mov z4.s, #0x7fffffff
+; CHECK_EXTEND_ROUND-NEXT:    uunpklo z0.d, z0.s
 ; CHECK_EXTEND_ROUND-NEXT:    fcvt z2.s, p0/m, z2.d
-; CHECK_EXTEND_ROUND-NEXT:    uunpkhi z4.d, z0.s
 ; CHECK_EXTEND_ROUND-NEXT:    fcvt z1.s, p0/m, z1.d
-; CHECK_EXTEND_ROUND-NEXT:    uunpklo z0.d, z0.s
-; CHECK_EXTEND_ROUND-NEXT:    bsl z4.d, z4.d, z2.d, z3.d
-; CHECK_EXTEND_ROUND-NEXT:    bsl z0.d, z0.d, z1.d, z3.d
-; CHECK_EXTEND_ROUND-NEXT:    uzp1 z0.s, z0.s, z4.s
+; CHECK_EXTEND_ROUND-NEXT:    bsl z3.d, z3.d, z2.d, z4.d
+; CHECK_EXTEND_ROUND-NEXT:    bsl z0.d, z0.d, z1.d, z4.d
+; CHECK_EXTEND_ROUND-NEXT:    uzp1 z0.s, z0.s, z3.s
 ; CHECK_EXTEND_ROUND-NEXT:    ret
   %tmp0 = fptrunc <vscale x 4 x double> %b to <vscale x 4 x float>
   %r = call <vscale x 4 x float> @llvm.copysign.v4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %tmp0)
@@ -110,9 +110,9 @@ define <vscale x 4 x double> @test_copysign_v4f64_v4f32(<vscale x 4 x double> %a
 ; CHECK_NO_EXTEND_ROUND-NEXT:    ptrue p0.d
 ; CHECK_NO_EXTEND_ROUND-NEXT:    uunpkhi z3.d, z2.s
 ; CHECK_NO_EXTEND_ROUND-NEXT:    uunpklo z2.d, z2.s
+; CHECK_NO_EXTEND_ROUND-NEXT:    mov z4.d, #0x7fffffffffffffff
 ; CHECK_NO_EXTEND_ROUND-NEXT:    fcvt z3.d, p0/m, z3.s
 ; CHECK_NO_EXTEND_ROUND-NEXT:    fcvt z2.d, p0/m, z2.s
-; CHECK_NO_EXTEND_ROUND-NEXT:    mov z4.d, #0x7fffffffffffffff
 ; CHECK_NO_EXTEND_ROUND-NEXT:    bsl z0.d, z0.d, z2.d, z4.d
 ; CHECK_NO_EXTEND_ROUND-NEXT:    bsl z1.d, z1.d, z3.d, z4.d
 ; CHECK_NO_EXTEND_ROUND-NEXT:    ret
@@ -120,13 +120,13 @@ define <vscale x 4 x double> @test_copysign_v4f64_v4f32(<vscale x 4 x double> %a
 ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32:
 ; CHECK_EXTEND_ROUND:       // %bb.0:
 ; CHECK_EXTEND_ROUND-NEXT:    ptrue p0.d
-; CHECK_EXTEND_ROUND-NEXT:    uunpklo z3.d, z2.s
-; CHECK_EXTEND_ROUND-NEXT:    uunpkhi z2.d, z2.s
-; CHECK_EXTEND_ROUND-NEXT:    fcvt z3.d, p0/m, z3.s
+; CHECK_EXTEND_ROUND-NEXT:    uunpkhi z3.d, z2.s
+; CHECK_EXTEND_ROUND-NEXT:    uunpklo z2.d, z2.s
 ; CHECK_EXTEND_ROUND-NEXT:    mov z4.d, #0x7fffffffffffffff
 ; CHECK_EXTEND_ROUND-NEXT:    fcvt z2.d, p0/m, z2.s
-; CHECK_EXTEND_ROUND-NEXT:    bsl z0.d, z0.d, z3.d, z4.d
-; CHECK_EXTEND_ROUND-NEXT:    bsl z1.d, z1.d, z2.d, z4.d
+; CHECK_EXTEND_ROUND-NEXT:    fcvt z3.d, p0/m, z3.s
+; CHECK_EXTEND_ROUND-NEXT:    bsl z0.d, z0.d, z2.d, z4.d
+; CHECK_EXTEND_ROUND-NEXT:    bsl z1.d, z1.d, z3.d, z4.d
 ; CHECK_EXTEND_ROUND-NEXT:    ret
   %tmp0 = fpext <vscale x 4 x float> %b to <vscale x 4 x double>
   %r = call <vscale x 4 x double> @llvm.copysign.v4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %tmp0)
@@ -186,14 +186,14 @@ define <vscale x 4 x half> @test_copysign_v4f16_v4f64(<vscale x 4 x half> %a, <v
 ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f16_v4f64:
 ; CHECK_EXTEND_ROUND:       // %bb.0:
 ; CHECK_EXTEND_ROUND-NEXT:    ptrue p0.d
-; CHECK_EXTEND_ROUND-NEXT:    mov z3.h, #32767 // =0x7fff
+; CHECK_EXTEND_ROUND-NEXT:    uunpkhi z3.d, z0.s
+; CHECK_EXTEND_ROUND-NEXT:    mov z4.h, #32767 // =0x7fff
+; CHECK_EXTEND_ROUND-NEXT:    uunpklo z0.d, z0.s
 ; CHECK_EXTEND_ROUND-NEXT:    fcvt z2.h, p0/m, z2.d
-; CHECK_EXTEND_ROUND-NEXT:    uunpkhi z4.d, z0.s
 ; CHECK_EXTEND_ROUND-NEXT:    fcvt z1.h, p0/m, z1.d
-; CHECK_EXTEND_ROUND-NEXT:    uunpklo z0.d, z0.s
-; CHECK_EXTEND_ROUND-NEXT:    bsl z4.d, z4.d, z2.d, z3.d
-; CHECK_EXTEND_ROUND-NEXT:    bsl z0.d, z0.d, z1.d, z3.d
-; CHECK_EXTEND_ROUND-NEXT:    uzp1 z0.s, z0.s, z4.s
+; CHECK_EXTEND_ROUND-NEXT:    bsl z3.d, z3.d, z2.d, z4.d
+; CHECK_EXTEND_ROUND-NEXT:    bsl z0.d, z0.d, z1.d, z4.d
+; CHECK_EXTEND_ROUND-NEXT:    uzp1 z0.s, z0.s, z3.s
 ; CHECK_EXTEND_ROUND-NEXT:    ret
   %tmp0 = fptrunc <vscale x 4 x double> %b to <vscale x 4 x half>
   %r = call <vscale x 4 x half> @llvm.copysign.v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %tmp0)
@@ -228,14 +228,14 @@ define <vscale x 8 x half> @test_copysign_v8f16_v8f32(<vscale x 8 x half> %a, <v
 ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v8f16_v8f32:
 ; CHECK_EXTEND_ROUND:       // %bb.0:
 ; CHECK_EXTEND_ROUND-NEXT:    ptrue p0.s
-; CHECK_EXTEND_ROUND-NEXT:    mov z3.h, #32767 // =0x7fff
+; CHECK_EXTEND_ROUND-NEXT:    uunpkhi z3.s, z0.h
+; CHECK_EXTEND_ROUND-NEXT:    mov z4.h, #32767 // =0x7fff
+; CHECK_EXTEND_ROUND-NEXT:    uunpklo z0.s, z0.h
 ; CHECK_EXTEND_ROUND-NEXT:    fcvt z2.h, p0/m, z2.s
-; CHECK_EXTEND_ROUND-NEXT:    uunpkhi z4.s, z0.h
 ; CHECK_EXTEND_ROUND-NEXT:    fcvt z1.h, p0/m, z1.s
-; CHECK_EXTEND_ROUND-NEXT:    uunpklo z0.s, z0.h
-; CHECK_EXTEND_ROUND-NEXT:    bsl z4.d, z4.d, z2.d, z3.d
-; CHECK_EXTEND_ROUND-NEXT:    bsl z0.d, z0.d, z1.d, z3.d
-; CHECK_EXTEND_ROUND-NEXT:    uzp1 z0.h, z0.h, z4.h
+; CHECK_EXTEND_ROUND-NEXT:    bsl z3.d, z3.d, z2.d, z4.d
+; CHECK_EXTEND_ROUND-NEXT:    bsl z0.d, z0.d, z1.d, z4.d
+; CHECK_EXTEND_ROUND-NEXT:    uzp1 z0.h, z0.h, z3.h
 ; CHECK_EXTEND_ROUND-NEXT:    ret
   %tmp0 = fptrunc <vscale x 8 x float> %b to <vscale x 8 x half>
   %r = call <vscale x 8 x half> @llvm.copysign.v8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %tmp0)

diff --git a/llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll
index 82ea8514f24dd0..e77f85bd46dd38 100644
--- a/llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll
@@ -16,10 +16,10 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v4f16_v4f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    mvni v0.4h, #128, lsl #8
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    bsl v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <4 x half>, ptr %ap
@@ -32,10 +32,10 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v8f16_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    mvni v2.8h, #128, lsl #8
-; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <8 x half>, ptr %ap
@@ -49,11 +49,11 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) vscale_range(2,0) #0
 ; CHECK-LABEL: test_copysign_v16f16_v16f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    mov z2.h, #32767 // =0x7fff
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    mov z0.h, #32767 // =0x7fff
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <16 x half>, ptr %ap
   %b = load <16 x half>, ptr %bp
@@ -65,27 +65,27 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) vscale_range(2,0) #0
 define void @test_copysign_v32f16_v32f16(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: test_copysign_v32f16_v32f16:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #16
 ; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov z4.h, #32767 // =0x7fff
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
-; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    bsl z0.d, z0.d, z2.d, z4.d
-; VBITS_GE_256-NEXT:    bsl z1.d, z1.d, z3.d, z4.d
-; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
+; VBITS_GE_256-NEXT:    mov z0.h, #32767 // =0x7fff
+; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1, x8, lsl #1]
+; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    bsl z1.d, z1.d, z3.d, z0.d
+; VBITS_GE_256-NEXT:    bsl z2.d, z2.d, z4.d, z0.d
+; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_GE_256-NEXT:    st1h { z2.h }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: test_copysign_v32f16_v32f16:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    mov z2.h, #32767 // =0x7fff
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
+; VBITS_GE_512-NEXT:    mov z0.h, #32767 // =0x7fff
+; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; VBITS_GE_512-NEXT:    st1h { z1.h }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %a = load <32 x half>, ptr %ap
   %b = load <32 x half>, ptr %bp
@@ -98,11 +98,11 @@ define void @test_copysign_v64f16_v64f16(ptr %ap, ptr %bp) vscale_range(8,0) #0
 ; CHECK-LABEL: test_copysign_v64f16_v64f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    mov z2.h, #32767 // =0x7fff
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    mov z0.h, #32767 // =0x7fff
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <64 x half>, ptr %ap
   %b = load <64 x half>, ptr %bp
@@ -115,11 +115,11 @@ define void @test_copysign_v128f16_v128f16(ptr %ap, ptr %bp) vscale_range(16,0)
 ; CHECK-LABEL: test_copysign_v128f16_v128f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    mov z2.h, #32767 // =0x7fff
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
-; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
+; CHECK-NEXT:    mov z0.h, #32767 // =0x7fff
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <128 x half>, ptr %ap
   %b = load <128 x half>, ptr %bp
@@ -133,10 +133,10 @@ define void @test_copysign_v128f16_v128f16(ptr %ap, ptr %bp) vscale_range(16,0)
 define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v2f32_v2f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    mvni v2.2s, #128, lsl #24
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    mvni v0.2s, #128, lsl #24
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    ldr d2, [x1]
+; CHECK-NEXT:    bsl v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <2 x float>, ptr %ap
@@ -149,10 +149,10 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v4f32_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    mvni v0.4s, #128, lsl #24
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    ldr q2, [x1]
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <4 x float>, ptr %ap
@@ -166,11 +166,11 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v8f32_v8f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    mov z2.s, #0x7fffffff
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    mov z0.s, #0x7fffffff
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <8 x float>, ptr %ap
   %b = load <8 x float>, ptr %bp
@@ -182,27 +182,27 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @test_copysign_v16f32_v16f32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: test_copysign_v16f32_v16f32:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #8
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov z4.s, #0x7fffffff
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
-; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    bsl z0.d, z0.d, z2.d, z4.d
-; VBITS_GE_256-NEXT:    bsl z1.d, z1.d, z3.d, z4.d
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT:    mov z0.s, #0x7fffffff
+; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    bsl z1.d, z1.d, z3.d, z0.d
+; VBITS_GE_256-NEXT:    bsl z2.d, z2.d, z4.d, z0.d
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: test_copysign_v16f32_v16f32:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    mov z2.s, #0x7fffffff
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
+; VBITS_GE_512-NEXT:    mov z0.s, #0x7fffffff
+; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; VBITS_GE_512-NEXT:    st1w { z1.s }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %a = load <16 x float>, ptr %ap
   %b = load <16 x float>, ptr %bp
@@ -215,11 +215,11 @@ define void @test_copysign_v32f32_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0
 ; CHECK-LABEL: test_copysign_v32f32_v32f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    mov z2.s, #0x7fffffff
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    mov z0.s, #0x7fffffff
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <32 x float>, ptr %ap
   %b = load <32 x float>, ptr %bp
@@ -232,11 +232,11 @@ define void @test_copysign_v64f32_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0
 ; CHECK-LABEL: test_copysign_v64f32_v64f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    mov z2.s, #0x7fffffff
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    mov z0.s, #0x7fffffff
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <64 x float>, ptr %ap
   %b = load <64 x float>, ptr %bp
@@ -268,11 +268,11 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v4f64_v4f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    mov z0.d, #0x7fffffffffffffff
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <4 x double>, ptr %ap
   %b = load <4 x double>, ptr %bp
@@ -284,27 +284,27 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 define void @test_copysign_v8f64_v8f64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: test_copysign_v8f64_v8f64:
 ; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    mov x8, #4
 ; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov z4.d, #0x7fffffffffffffff
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
-; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
-; VBITS_GE_256-NEXT:    bsl z0.d, z0.d, z2.d, z4.d
-; VBITS_GE_256-NEXT:    bsl z1.d, z1.d, z3.d, z4.d
-; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
+; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
+; VBITS_GE_256-NEXT:    mov z0.d, #0x7fffffffffffffff
+; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
+; VBITS_GE_256-NEXT:    bsl z1.d, z1.d, z3.d, z0.d
+; VBITS_GE_256-NEXT:    bsl z2.d, z2.d, z4.d, z0.d
+; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_GE_256-NEXT:    st1d { z2.d }, p0, [x0]
 ; VBITS_GE_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: test_copysign_v8f64_v8f64:
 ; VBITS_GE_512:       // %bb.0:
 ; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    mov z2.d, #0x7fffffffffffffff
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
+; VBITS_GE_512-NEXT:    mov z0.d, #0x7fffffffffffffff
+; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; VBITS_GE_512-NEXT:    st1d { z1.d }, p0, [x0]
 ; VBITS_GE_512-NEXT:    ret
   %a = load <8 x double>, ptr %ap
   %b = load <8 x double>, ptr %bp
@@ -317,11 +317,11 @@ define void @test_copysign_v16f64_v16f64(ptr %ap, ptr %bp) vscale_range(8,0) #0
 ; CHECK-LABEL: test_copysign_v16f64_v16f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    mov z0.d, #0x7fffffffffffffff
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <16 x double>, ptr %ap
   %b = load <16 x double>, ptr %bp
@@ -334,11 +334,11 @@ define void @test_copysign_v32f64_v32f64(ptr %ap, ptr %bp) vscale_range(16,0) #0
 ; CHECK-LABEL: test_copysign_v32f64_v32f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    mov z2.d, #0x7fffffffffffffff
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    bsl z0.d, z0.d, z1.d, z2.d
-; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    mov z0.d, #0x7fffffffffffffff
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1]
+; CHECK-NEXT:    bsl z1.d, z1.d, z2.d, z0.d
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <32 x double>, ptr %ap
   %b = load <32 x double>, ptr %bp
@@ -353,10 +353,10 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v2f32_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    mvni v2.2s, #128, lsl #24
-; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    mvni v1.2s, #128, lsl #24
+; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <2 x float>, ptr %ap
@@ -375,10 +375,10 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mvni v2.4s, #128, lsl #24
-; CHECK-NEXT:    fcvt z1.s, p0/m, z1.d
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcvt z1.s, p1/m, z1.d
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    str q0, [x0]
@@ -452,10 +452,10 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK-LABEL: test_copysign_v4f16_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    mvni v1.4h, #128, lsl #8
+; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    bit v0.8b, v2.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %a = load <4 x half>, ptr %ap
@@ -471,10 +471,10 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.d, vl4
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    mvni v2.4h, #128, lsl #8
-; CHECK-NEXT:    fcvt z1.h, p0/m, z1.d
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT:    fcvt z1.h, p1/m, z1.d
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
@@ -498,10 +498,10 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s, vl8
 ; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
-; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    mvni v2.8h, #128, lsl #8
-; CHECK-NEXT:    fcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT:    fcvt z1.h, p1/m, z1.s
 ; CHECK-NEXT:    uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    str q0, [x0]

diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
index c4f9858d559a12..f94daa45fb82a6 100644
--- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll
@@ -105,10 +105,10 @@ define void @wide_add_shift_add_rshrnb_b(ptr %dest, i64 %index, <vscale x 16 x i
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    rshrnb z1.b, z1.h, #6
-; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x0, x1]
 ; CHECK-NEXT:    rshrnb z0.b, z0.h, #6
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
-; CHECK-NEXT:    add z0.b, z2.b, z0.b
+; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0, x1]
+; CHECK-NEXT:    add z0.b, z1.b, z0.b
 ; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x1]
 ; CHECK-NEXT:    ret
   %1 = add <vscale x 16 x i16> %arg1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 32, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
@@ -126,10 +126,10 @@ define void @wide_add_shift_add_rshrnb_h(ptr %dest, i64 %index, <vscale x 8 x i3
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    rshrnb z1.h, z1.s, #6
-; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x1, lsl #1]
 ; CHECK-NEXT:    rshrnb z0.h, z0.s, #6
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    add z0.h, z2.h, z0.h
+; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    add z0.h, z1.h, z0.h
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x1, lsl #1]
 ; CHECK-NEXT:    ret
   %1 = add <vscale x 8 x i32> %arg1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 32, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
@@ -184,10 +184,10 @@ define void @neg_add_has_two_uses(ptr %ptr, ptr %dst, ptr %dst2, i64 %index){
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    add z0.h, z0.h, #32 // =0x20
-; CHECK-NEXT:    lsr z1.h, z0.h, #6
-; CHECK-NEXT:    add z0.h, z0.h, z0.h
-; CHECK-NEXT:    st1h { z0.h }, p0, [x2, x3, lsl #1]
-; CHECK-NEXT:    st1b { z1.h }, p0, [x1, x3]
+; CHECK-NEXT:    add z1.h, z0.h, z0.h
+; CHECK-NEXT:    lsr z0.h, z0.h, #6
+; CHECK-NEXT:    st1h { z1.h }, p0, [x2, x3, lsl #1]
+; CHECK-NEXT:    st1b { z0.h }, p0, [x1, x3]
 ; CHECK-NEXT:    ret
   %load = load <vscale x 8 x i16>, ptr %ptr, align 2
   %1 = add <vscale x 8 x i16> %load, trunc (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 32, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer) to <vscale x 8 x i16>)

diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll
index 8bae00df00ef0b..a42d99fd8318e4 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx2.ll
@@ -8,10 +8,10 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8> } @sel_x2_i8(target("aarch64.svc
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sel { z0.b, z1.b }, pn8, { z6.b, z7.b }, { z4.b, z5.b }
@@ -28,10 +28,10 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16> } @sel_x2_i16(target("aarch64.sv
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h }
@@ -48,10 +48,10 @@ define { <vscale x 8 x half>, <vscale x 8 x half> } @sel_x2_f16(target("aarch64.
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h }
@@ -68,10 +68,10 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @sel_x2_bf16(target("aar
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h }
@@ -88,10 +88,10 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32> } @sel_x2_i32(target("aarch64.sv
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s }
@@ -108,10 +108,10 @@ define { <vscale x 4 x float>, <vscale x 4 x float> } @sel_x2_f32(target("aarch6
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s }
@@ -128,10 +128,10 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64> } @sel_x2_i64(target("aarch64.sv
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d }
@@ -148,10 +148,10 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @sel_x2_f64(target("aarc
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z5.d, z4.d
-; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z5.d, z4.d
+; CHECK-NEXT:    mov z7.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z3.d
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d }

diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll
index 5505aea7879c7e..df504362680ba1 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll
@@ -8,17 +8,17 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z26.d, z7.d
-; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    ld1b { z27.b }, p1/z, [x0]
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
-; CHECK-NEXT:    mov p8.b, p0.b
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1b { z27.b }, p1/z, [x0]
 ; CHECK-NEXT:    sel { z0.b - z3.b }, pn8, { z28.b - z31.b }, { z24.b - z27.b }
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -33,17 +33,17 @@ define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z26.d, z7.d
-; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    ld1h { z27.h }, p1/z, [x0]
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
-; CHECK-NEXT:    mov p8.b, p0.b
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p1/z, [x0]
 ; CHECK-NEXT:    sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -58,17 +58,17 @@ define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z26.d, z7.d
-; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    ld1h { z27.h }, p1/z, [x0]
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
-; CHECK-NEXT:    mov p8.b, p0.b
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p1/z, [x0]
 ; CHECK-NEXT:    sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -83,17 +83,17 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z26.d, z7.d
-; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    ld1h { z27.h }, p1/z, [x0]
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
-; CHECK-NEXT:    mov p8.b, p0.b
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1h { z27.h }, p1/z, [x0]
 ; CHECK-NEXT:    sel { z0.h - z3.h }, pn8, { z28.h - z31.h }, { z24.h - z27.h }
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -108,17 +108,17 @@ define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z26.d, z7.d
-; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    ld1w { z27.s }, p1/z, [x0]
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
-; CHECK-NEXT:    mov p8.b, p0.b
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1w { z27.s }, p1/z, [x0]
 ; CHECK-NEXT:    sel { z0.s - z3.s }, pn8, { z28.s - z31.s }, { z24.s - z27.s }
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -133,17 +133,17 @@ define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vsca
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z26.d, z7.d
-; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    ld1w { z27.s }, p1/z, [x0]
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
-; CHECK-NEXT:    mov p8.b, p0.b
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1w { z27.s }, p1/z, [x0]
 ; CHECK-NEXT:    sel { z0.s - z3.s }, pn8, { z28.s - z31.s }, { z24.s - z27.s }
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -158,17 +158,17 @@ define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z26.d, z7.d
-; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    ld1d { z27.d }, p1/z, [x0]
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
-; CHECK-NEXT:    mov p8.b, p0.b
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1d { z27.d }, p1/z, [x0]
 ; CHECK-NEXT:    sel { z0.d - z3.d }, pn8, { z28.d - z31.d }, { z24.d - z27.d }
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -183,17 +183,17 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z26.d, z7.d
-; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    ptrue p1.d
-; CHECK-NEXT:    mov z24.d, z5.d
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov z26.d, z7.d
 ; CHECK-NEXT:    mov z31.d, z4.d
-; CHECK-NEXT:    ld1d { z27.d }, p1/z, [x0]
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z25.d, z6.d
 ; CHECK-NEXT:    mov z30.d, z3.d
+; CHECK-NEXT:    mov z24.d, z5.d
 ; CHECK-NEXT:    mov z29.d, z2.d
-; CHECK-NEXT:    mov p8.b, p0.b
 ; CHECK-NEXT:    mov z28.d, z1.d
+; CHECK-NEXT:    ld1d { z27.d }, p1/z, [x0]
 ; CHECK-NEXT:    sel { z0.d - z3.d }, pn8, { z28.d - z31.d }, { z24.d - z27.d }
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #1

diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll
index eb3d199a31e271..39639120802fa5 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll
@@ -9,9 +9,9 @@ define void @st1_x2_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vsc
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    st1b { z2.b, z3.b }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -27,9 +27,9 @@ define void @st1_x2_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    st1h { z2.h, z3.h }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -45,9 +45,9 @@ define void @st1_x2_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    st1w { z2.s, z3.s }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -63,9 +63,9 @@ define void @st1_x2_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    st1d { z2.d, z3.d }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -81,9 +81,9 @@ define void @st1_x2_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    st1h { z2.h, z3.h }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -99,9 +99,9 @@ define void @st1_x2_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    st1h { z2.h, z3.h }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -117,9 +117,9 @@ define void @st1_x2_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    st1w { z2.s, z3.s }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -135,9 +135,9 @@ define void @st1_x2_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    st1d { z2.d, z3.d }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -153,10 +153,10 @@ define void @st1_x4_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vsc
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    st1b { z4.b - z7.b }, pn8, [x0]
@@ -173,10 +173,10 @@ define void @st1_x4_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    st1h { z4.h - z7.h }, pn8, [x0]
@@ -193,10 +193,10 @@ define void @st1_x4_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    st1w { z4.s - z7.s }, pn8, [x0]
@@ -213,10 +213,10 @@ define void @st1_x4_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vs
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    st1d { z4.d - z7.d }, pn8, [x0]
@@ -233,10 +233,10 @@ define void @st1_x4_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    st1h { z4.h - z7.h }, pn8, [x0]
@@ -253,10 +253,10 @@ define void @st1_x4_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    st1h { z4.h - z7.h }, pn8, [x0]
@@ -273,10 +273,10 @@ define void @st1_x4_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    st1w { z4.s - z7.s }, pn8, [x0]
@@ -293,10 +293,10 @@ define void @st1_x4_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    st1d { z4.d - z7.d }, pn8, [x0]
@@ -315,9 +315,9 @@ define void @stnt1_x2_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    stnt1b { z2.b, z3.b }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -333,9 +333,9 @@ define void @stnt1_x2_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    stnt1h { z2.h, z3.h }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -351,9 +351,9 @@ define void @stnt1_x2_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    stnt1w { z2.s, z3.s }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -369,9 +369,9 @@ define void @stnt1_x2_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    stnt1d { z2.d, z3.d }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -387,9 +387,9 @@ define void @stnt1_x2_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    stnt1h { z2.h, z3.h }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -405,9 +405,9 @@ define void @stnt1_x2_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    stnt1h { z2.h, z3.h }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -423,9 +423,9 @@ define void @stnt1_x2_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    stnt1w { z2.s, z3.s }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -441,9 +441,9 @@ define void @stnt1_x2_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z3.d, z2.d
 ; CHECK-NEXT:    mov z2.d, z1.d
 ; CHECK-NEXT:    stnt1d { z2.d, z3.d }, pn8, [x0]
 ; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -459,10 +459,10 @@ define void @stnt1_x4_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <v
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    stnt1b { z4.b - z7.b }, pn8, [x0]
@@ -479,10 +479,10 @@ define void @stnt1_x4_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    stnt1h { z4.h - z7.h }, pn8, [x0]
@@ -499,10 +499,10 @@ define void @stnt1_x4_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    stnt1w { z4.s - z7.s }, pn8, [x0]
@@ -519,10 +519,10 @@ define void @stnt1_x4_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    stnt1d { z4.d - z7.d }, pn8, [x0]
@@ -539,10 +539,10 @@ define void @stnt1_x4_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    stnt1h { z4.h - z7.h }, pn8, [x0]
@@ -559,10 +559,10 @@ define void @stnt1_x4_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    stnt1h { z4.h - z7.h }, pn8, [x0]
@@ -579,10 +579,10 @@ define void @stnt1_x4_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    stnt1w { z4.s - z7.s }, pn8, [x0]
@@ -599,10 +599,10 @@ define void @stnt1_x4_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z7.d, z4.d
 ; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    mov z6.d, z3.d
 ; CHECK-NEXT:    mov z5.d, z2.d
 ; CHECK-NEXT:    mov z4.d, z1.d
 ; CHECK-NEXT:    stnt1d { z4.d - z7.d }, pn8, [x0]

diff --git a/llvm/test/CodeGen/AArch64/swift-return.ll b/llvm/test/CodeGen/AArch64/swift-return.ll
index 5cd9182b6842f4..c0069c8e7a2391 100644
--- a/llvm/test/CodeGen/AArch64/swift-return.ll
+++ b/llvm/test/CodeGen/AArch64/swift-return.ll
@@ -236,8 +236,8 @@ define swiftcc { i8, i8, i8, i8 } @gen9(i8 %key) {
 ; CHECK-LABEL: _gen10
 ; CHECK:  fmov         d1, d0
 ; CHECK:  fmov         d2, d0
-; CHECK:  fmov         d3, d0
 ; CHECK:  mov      w1, w0
+; CHECK:  fmov         d3, d0
 ; CHECK:  mov      w2, w0
 ; CHECK:  mov      w3, w0
 ; CHECK:  ret

diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll
index d76fb959fd6cef..cd06f8dbfad84c 100644
--- a/llvm/test/CodeGen/AArch64/swifterror.ll
+++ b/llvm/test/CodeGen/AArch64/swifterror.ll
@@ -18,10 +18,10 @@ define float @foo(ptr swifterror %error_ptr_ref) {
 ; CHECK-APPLE-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-APPLE-NEXT:    .cfi_offset w30, -8
 ; CHECK-APPLE-NEXT:    .cfi_offset w29, -16
-; CHECK-APPLE-NEXT:    mov w0, #16
+; CHECK-APPLE-NEXT:    mov w0, #16 ; =0x10
 ; CHECK-APPLE-NEXT:    bl _malloc
-; CHECK-APPLE-NEXT:    mov w8, #1
 ; CHECK-APPLE-NEXT:    fmov s0, #1.00000000
+; CHECK-APPLE-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-APPLE-NEXT:    mov x21, x0
 ; CHECK-APPLE-NEXT:    strb w8, [x0, #8]
 ; CHECK-APPLE-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
@@ -34,11 +34,11 @@ define float @foo(ptr swifterror %error_ptr_ref) {
 ; CHECK-O0-AARCH64-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-O0-AARCH64-NEXT:    .cfi_offset w30, -8
 ; CHECK-O0-AARCH64-NEXT:    .cfi_offset w29, -16
-; CHECK-O0-AARCH64-NEXT:    mov w8, #16
+; CHECK-O0-AARCH64-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-AARCH64-NEXT:    mov w0, w8
 ; CHECK-O0-AARCH64-NEXT:    bl _malloc
 ; CHECK-O0-AARCH64-NEXT:    mov x21, x0
-; CHECK-O0-AARCH64-NEXT:    mov w8, #1
+; CHECK-O0-AARCH64-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-AARCH64-NEXT:    strb w8, [x0, #8]
 ; CHECK-O0-AARCH64-NEXT:    fmov s0, #1.00000000
 ; CHECK-O0-AARCH64-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
@@ -49,11 +49,11 @@ define float @foo(ptr swifterror %error_ptr_ref) {
 ; CHECK-O0-ARM64_32-NEXT:    str x30, [sp, #-16]! ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-O0-ARM64_32-NEXT:    .cfi_offset w30, -16
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #16
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-ARM64_32-NEXT:    mov w0, w8
 ; CHECK-O0-ARM64_32-NEXT:    bl _malloc
 ; CHECK-O0-ARM64_32-NEXT:    mov x21, x0
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #1
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-ARM64_32-NEXT:    strb w8, [x0, #8]
 ; CHECK-O0-ARM64_32-NEXT:    fmov s0, #1.00000000
 ; CHECK-O0-ARM64_32-NEXT:    ldr x30, [sp], #16 ; 8-byte Folded Reload
@@ -231,8 +231,8 @@ define float @caller2(ptr %error_ref) {
 ; CHECK-APPLE-AARCH64-NEXT:    .cfi_offset w22, -48
 ; CHECK-APPLE-AARCH64-NEXT:    .cfi_offset b8, -56
 ; CHECK-APPLE-AARCH64-NEXT:    .cfi_offset b9, -64
-; CHECK-APPLE-AARCH64-NEXT:    mov x19, x0
 ; CHECK-APPLE-AARCH64-NEXT:    fmov s8, #1.00000000
+; CHECK-APPLE-AARCH64-NEXT:    mov x19, x0
 ; CHECK-APPLE-AARCH64-NEXT:  LBB2_1: ; %bb_loop
 ; CHECK-APPLE-AARCH64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-APPLE-AARCH64-NEXT:    mov x21, xzr
@@ -313,8 +313,8 @@ define float @caller2(ptr %error_ref) {
 ; CHECK-APPLE-ARM64_32-NEXT:    .cfi_offset w22, -48
 ; CHECK-APPLE-ARM64_32-NEXT:    .cfi_offset b8, -56
 ; CHECK-APPLE-ARM64_32-NEXT:    .cfi_offset b9, -64
-; CHECK-APPLE-ARM64_32-NEXT:    mov x19, x0
 ; CHECK-APPLE-ARM64_32-NEXT:    fmov s8, #1.00000000
+; CHECK-APPLE-ARM64_32-NEXT:    mov x19, x0
 ; CHECK-APPLE-ARM64_32-NEXT:  LBB2_1: ; %bb_loop
 ; CHECK-APPLE-ARM64_32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-APPLE-ARM64_32-NEXT:    mov x21, xzr
@@ -414,11 +414,11 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
 ; CHECK-APPLE-NEXT:    .cfi_offset w29, -16
 ; CHECK-APPLE-NEXT:    cbz w0, LBB3_2
 ; CHECK-APPLE-NEXT:  ; %bb.1: ; %gen_error
-; CHECK-APPLE-NEXT:    mov w0, #16
+; CHECK-APPLE-NEXT:    mov w0, #16 ; =0x10
 ; CHECK-APPLE-NEXT:    bl _malloc
 ; CHECK-APPLE-NEXT:    mov x21, x0
-; CHECK-APPLE-NEXT:    mov w8, #1
 ; CHECK-APPLE-NEXT:    fmov s0, #1.00000000
+; CHECK-APPLE-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-APPLE-NEXT:    strb w8, [x0, #8]
 ; CHECK-APPLE-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-APPLE-NEXT:    ret
@@ -438,11 +438,11 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
 ; CHECK-O0-AARCH64-NEXT:    str x21, [sp, #8] ; 8-byte Folded Spill
 ; CHECK-O0-AARCH64-NEXT:    cbz w0, LBB3_2
 ; CHECK-O0-AARCH64-NEXT:  ; %bb.1: ; %gen_error
-; CHECK-O0-AARCH64-NEXT:    mov w8, #16
+; CHECK-O0-AARCH64-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-AARCH64-NEXT:    mov w0, w8
 ; CHECK-O0-AARCH64-NEXT:    bl _malloc
 ; CHECK-O0-AARCH64-NEXT:    mov x21, x0
-; CHECK-O0-AARCH64-NEXT:    mov w8, #1
+; CHECK-O0-AARCH64-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-AARCH64-NEXT:    strb w8, [x0, #8]
 ; CHECK-O0-AARCH64-NEXT:    fmov s0, #1.00000000
 ; CHECK-O0-AARCH64-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
@@ -464,11 +464,11 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
 ; CHECK-O0-ARM64_32-NEXT:    str x21, [sp, #8] ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:    cbz w0, LBB3_2
 ; CHECK-O0-ARM64_32-NEXT:  ; %bb.1: ; %gen_error
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #16
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-ARM64_32-NEXT:    mov w0, w8
 ; CHECK-O0-ARM64_32-NEXT:    bl _malloc
 ; CHECK-O0-ARM64_32-NEXT:    mov x21, x0
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #1
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-ARM64_32-NEXT:    strb w8, [x0, #8]
 ; CHECK-O0-ARM64_32-NEXT:    fmov s0, #1.00000000
 ; CHECK-O0-ARM64_32-NEXT:    ldr x30, [sp, #16] ; 8-byte Folded Reload
@@ -517,8 +517,8 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-APPLE-NEXT:    fmov s8, s0
 ; CHECK-APPLE-NEXT:    mov w19, w0
 ; CHECK-APPLE-NEXT:    mov x0, x21
-; CHECK-APPLE-NEXT:    mov w20, #1
 ; CHECK-APPLE-NEXT:    fmov s9, #1.00000000
+; CHECK-APPLE-NEXT:    mov w20, #1 ; =0x1
 ; CHECK-APPLE-NEXT:    b LBB4_2
 ; CHECK-APPLE-NEXT:  LBB4_1: ; %bb_cont
 ; CHECK-APPLE-NEXT:    ; in Loop: Header=BB4_2 Depth=1
@@ -529,7 +529,7 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-APPLE-NEXT:    cbz w19, LBB4_1
 ; CHECK-APPLE-NEXT:  ; %bb.3: ; %gen_error
 ; CHECK-APPLE-NEXT:    ; in Loop: Header=BB4_2 Depth=1
-; CHECK-APPLE-NEXT:    mov w0, #16
+; CHECK-APPLE-NEXT:    mov w0, #16 ; =0x10
 ; CHECK-APPLE-NEXT:    bl _malloc
 ; CHECK-APPLE-NEXT:    strb w20, [x0, #8]
 ; CHECK-APPLE-NEXT:    b LBB4_1
@@ -561,11 +561,11 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-O0-AARCH64-NEXT:    cbz w8, LBB4_3
 ; CHECK-O0-AARCH64-NEXT:  ; %bb.2: ; %gen_error
 ; CHECK-O0-AARCH64-NEXT:    ; in Loop: Header=BB4_1 Depth=1
-; CHECK-O0-AARCH64-NEXT:    mov w8, #16
+; CHECK-O0-AARCH64-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-AARCH64-NEXT:    mov w0, w8
 ; CHECK-O0-AARCH64-NEXT:    bl _malloc
 ; CHECK-O0-AARCH64-NEXT:    mov x9, x0
-; CHECK-O0-AARCH64-NEXT:    mov w8, #1
+; CHECK-O0-AARCH64-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-AARCH64-NEXT:    strb w8, [x9, #8]
 ; CHECK-O0-AARCH64-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
 ; CHECK-O0-AARCH64-NEXT:  LBB4_3: ; %bb_cont
@@ -602,13 +602,13 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-O0-ARM64_32-NEXT:    cbz w8, LBB4_3
 ; CHECK-O0-ARM64_32-NEXT:  ; %bb.2: ; %gen_error
 ; CHECK-O0-ARM64_32-NEXT:    ; in Loop: Header=BB4_1 Depth=1
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #16
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-ARM64_32-NEXT:    mov w0, w8
 ; CHECK-O0-ARM64_32-NEXT:    bl _malloc
 ; CHECK-O0-ARM64_32-NEXT:    mov x9, x0
 ; CHECK-O0-ARM64_32-NEXT:    ; kill: def $x0 killed $x9
 ; CHECK-O0-ARM64_32-NEXT:    mov x0, x9
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #1
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-ARM64_32-NEXT:    strb w8, [x9, #8]
 ; CHECK-O0-ARM64_32-NEXT:    str x0, [sp, #8] ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:  LBB4_3: ; %bb_cont
@@ -671,11 +671,11 @@ define void @foo_sret(ptr sret(%struct.S) %agg.result, i32 %val1, ptr swifterror
 ; CHECK-APPLE-NEXT:    .cfi_offset w20, -32
 ; CHECK-APPLE-NEXT:    mov w19, w0
 ; CHECK-APPLE-NEXT:    mov x20, x8
-; CHECK-APPLE-NEXT:    mov w0, #16
+; CHECK-APPLE-NEXT:    mov w0, #16 ; =0x10
 ; CHECK-APPLE-NEXT:    bl _malloc
-; CHECK-APPLE-NEXT:    mov w8, #1
-; CHECK-APPLE-NEXT:    mov x21, x0
+; CHECK-APPLE-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-APPLE-NEXT:    strb w8, [x0, #8]
+; CHECK-APPLE-NEXT:    mov x21, x0
 ; CHECK-APPLE-NEXT:    str w19, [x20, #4]
 ; CHECK-APPLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
 ; CHECK-APPLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
@@ -691,14 +691,14 @@ define void @foo_sret(ptr sret(%struct.S) %agg.result, i32 %val1, ptr swifterror
 ; CHECK-O0-AARCH64-NEXT:    .cfi_offset w29, -16
 ; CHECK-O0-AARCH64-NEXT:    stur w0, [x29, #-4] ; 4-byte Folded Spill
 ; CHECK-O0-AARCH64-NEXT:    str x8, [sp] ; 8-byte Folded Spill
-; CHECK-O0-AARCH64-NEXT:    mov w8, #16
+; CHECK-O0-AARCH64-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-AARCH64-NEXT:    mov w0, w8
 ; CHECK-O0-AARCH64-NEXT:    bl _malloc
 ; CHECK-O0-AARCH64-NEXT:    ldr x8, [sp] ; 8-byte Folded Reload
 ; CHECK-O0-AARCH64-NEXT:    mov x10, x0
 ; CHECK-O0-AARCH64-NEXT:    ldur w0, [x29, #-4] ; 4-byte Folded Reload
 ; CHECK-O0-AARCH64-NEXT:    mov x21, x10
-; CHECK-O0-AARCH64-NEXT:    mov w9, #1
+; CHECK-O0-AARCH64-NEXT:    mov w9, #1 ; =0x1
 ; CHECK-O0-AARCH64-NEXT:    strb w9, [x10, #8]
 ; CHECK-O0-AARCH64-NEXT:    str w0, [x8, #4]
 ; CHECK-O0-AARCH64-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
@@ -713,14 +713,14 @@ define void @foo_sret(ptr sret(%struct.S) %agg.result, i32 %val1, ptr swifterror
 ; CHECK-O0-ARM64_32-NEXT:    .cfi_offset w30, -16
 ; CHECK-O0-ARM64_32-NEXT:    str w0, [sp, #12] ; 4-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:    str x8, [sp] ; 8-byte Folded Spill
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #16
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-ARM64_32-NEXT:    mov w0, w8
 ; CHECK-O0-ARM64_32-NEXT:    bl _malloc
 ; CHECK-O0-ARM64_32-NEXT:    ldr x8, [sp] ; 8-byte Folded Reload
 ; CHECK-O0-ARM64_32-NEXT:    mov x10, x0
 ; CHECK-O0-ARM64_32-NEXT:    ldr w0, [sp, #12] ; 4-byte Folded Reload
 ; CHECK-O0-ARM64_32-NEXT:    mov x21, x10
-; CHECK-O0-ARM64_32-NEXT:    mov w9, #1
+; CHECK-O0-ARM64_32-NEXT:    mov w9, #1 ; =0x1
 ; CHECK-O0-ARM64_32-NEXT:    strb w9, [x10, #8]
 ; CHECK-O0-ARM64_32-NEXT:    str w0, [x8, #4]
 ; CHECK-O0-ARM64_32-NEXT:    ldr x30, [sp, #16] ; 8-byte Folded Reload
@@ -757,7 +757,7 @@ define float @caller3(ptr %error_ref) {
 ; CHECK-APPLE-AARCH64-NEXT:    .cfi_offset w22, -48
 ; CHECK-APPLE-AARCH64-NEXT:    mov x19, x0
 ; CHECK-APPLE-AARCH64-NEXT:    add x8, sp, #8
-; CHECK-APPLE-AARCH64-NEXT:    mov w0, #1
+; CHECK-APPLE-AARCH64-NEXT:    mov w0, #1 ; =0x1
 ; CHECK-APPLE-AARCH64-NEXT:    mov x21, xzr
 ; CHECK-APPLE-AARCH64-NEXT:    bl _foo_sret
 ; CHECK-APPLE-AARCH64-NEXT:    mov x0, x21
@@ -789,7 +789,7 @@ define float @caller3(ptr %error_ref) {
 ; CHECK-O0-AARCH64-NEXT:    str x0, [sp] ; 8-byte Folded Spill
 ; CHECK-O0-AARCH64-NEXT:    mov x21, xzr
 ; CHECK-O0-AARCH64-NEXT:    add x8, sp, #24
-; CHECK-O0-AARCH64-NEXT:    mov w0, #1
+; CHECK-O0-AARCH64-NEXT:    mov w0, #1 ; =0x1
 ; CHECK-O0-AARCH64-NEXT:    bl _foo_sret
 ; CHECK-O0-AARCH64-NEXT:    str x21, [sp, #8] ; 8-byte Folded Spill
 ; CHECK-O0-AARCH64-NEXT:    cbnz x21, LBB6_2
@@ -823,7 +823,7 @@ define float @caller3(ptr %error_ref) {
 ; CHECK-APPLE-ARM64_32-NEXT:    .cfi_offset w22, -48
 ; CHECK-APPLE-ARM64_32-NEXT:    mov x19, x0
 ; CHECK-APPLE-ARM64_32-NEXT:    add x8, sp, #8
-; CHECK-APPLE-ARM64_32-NEXT:    mov w0, #1
+; CHECK-APPLE-ARM64_32-NEXT:    mov w0, #1 ; =0x1
 ; CHECK-APPLE-ARM64_32-NEXT:    mov x21, xzr
 ; CHECK-APPLE-ARM64_32-NEXT:    bl _foo_sret
 ; CHECK-APPLE-ARM64_32-NEXT:    mov x0, x21
@@ -854,7 +854,7 @@ define float @caller3(ptr %error_ref) {
 ; CHECK-O0-ARM64_32-NEXT:    str x0, [sp] ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:    mov x21, xzr
 ; CHECK-O0-ARM64_32-NEXT:    add x8, sp, #24
-; CHECK-O0-ARM64_32-NEXT:    mov w0, #1
+; CHECK-O0-ARM64_32-NEXT:    mov w0, #1 ; =0x1
 ; CHECK-O0-ARM64_32-NEXT:    bl _foo_sret
 ; CHECK-O0-ARM64_32-NEXT:    str x21, [sp, #8] ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:    cmp x21, #0
@@ -908,21 +908,21 @@ define float @foo_vararg(ptr swifterror %error_ptr_ref, ...) {
 ; CHECK-APPLE-AARCH64-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-APPLE-AARCH64-NEXT:    .cfi_offset w30, -8
 ; CHECK-APPLE-AARCH64-NEXT:    .cfi_offset w29, -16
-; CHECK-APPLE-AARCH64-NEXT:    mov w0, #16
+; CHECK-APPLE-AARCH64-NEXT:    mov w0, #16 ; =0x10
 ; CHECK-APPLE-AARCH64-NEXT:    bl _malloc
-; CHECK-APPLE-AARCH64-NEXT:    mov w8, #1
-; CHECK-APPLE-AARCH64-NEXT:    add x9, x29, #16
-; CHECK-APPLE-AARCH64-NEXT:    ldr w10, [x29, #16]
-; CHECK-APPLE-AARCH64-NEXT:    orr x9, x9, #0x8
+; CHECK-APPLE-AARCH64-NEXT:    mov w8, #1 ; =0x1
+; CHECK-APPLE-AARCH64-NEXT:    ldr w9, [x29, #16]
 ; CHECK-APPLE-AARCH64-NEXT:    strb w8, [x0, #8]
-; CHECK-APPLE-AARCH64-NEXT:    stur w10, [x29, #-12]
-; CHECK-APPLE-AARCH64-NEXT:    ldr w8, [x9], #8
-; CHECK-APPLE-AARCH64-NEXT:    str w8, [sp, #16]
-; CHECK-APPLE-AARCH64-NEXT:    ldr w8, [x9], #8
+; CHECK-APPLE-AARCH64-NEXT:    add x8, x29, #16
+; CHECK-APPLE-AARCH64-NEXT:    orr x8, x8, #0x8
+; CHECK-APPLE-AARCH64-NEXT:    stur w9, [x29, #-12]
+; CHECK-APPLE-AARCH64-NEXT:    ldr w9, [x8], #8
+; CHECK-APPLE-AARCH64-NEXT:    str w9, [sp, #16]
 ; CHECK-APPLE-AARCH64-NEXT:    fmov s0, #1.00000000
+; CHECK-APPLE-AARCH64-NEXT:    ldr w9, [x8], #8
+; CHECK-APPLE-AARCH64-NEXT:    stur x8, [x29, #-8]
 ; CHECK-APPLE-AARCH64-NEXT:    mov x21, x0
-; CHECK-APPLE-AARCH64-NEXT:    stur x9, [x29, #-8]
-; CHECK-APPLE-AARCH64-NEXT:    str w8, [sp, #12]
+; CHECK-APPLE-AARCH64-NEXT:    str w9, [sp, #12]
 ; CHECK-APPLE-AARCH64-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
 ; CHECK-APPLE-AARCH64-NEXT:    add sp, sp, #48
 ; CHECK-APPLE-AARCH64-NEXT:    ret
@@ -935,11 +935,11 @@ define float @foo_vararg(ptr swifterror %error_ptr_ref, ...) {
 ; CHECK-O0-AARCH64-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-O0-AARCH64-NEXT:    .cfi_offset w30, -8
 ; CHECK-O0-AARCH64-NEXT:    .cfi_offset w29, -16
-; CHECK-O0-AARCH64-NEXT:    mov w8, #16
+; CHECK-O0-AARCH64-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-AARCH64-NEXT:    mov w0, w8
 ; CHECK-O0-AARCH64-NEXT:    bl _malloc
 ; CHECK-O0-AARCH64-NEXT:    mov x21, x0
-; CHECK-O0-AARCH64-NEXT:    mov w8, #1
+; CHECK-O0-AARCH64-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-AARCH64-NEXT:    strb w8, [x0, #8]
 ; CHECK-O0-AARCH64-NEXT:    add x8, x29, #16
 ; CHECK-O0-AARCH64-NEXT:    stur x8, [x29, #-8]
@@ -971,23 +971,23 @@ define float @foo_vararg(ptr swifterror %error_ptr_ref, ...) {
 ; CHECK-APPLE-ARM64_32-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-APPLE-ARM64_32-NEXT:    .cfi_offset w30, -8
 ; CHECK-APPLE-ARM64_32-NEXT:    .cfi_offset w29, -16
-; CHECK-APPLE-ARM64_32-NEXT:    mov w0, #16
+; CHECK-APPLE-ARM64_32-NEXT:    mov w0, #16 ; =0x10
 ; CHECK-APPLE-ARM64_32-NEXT:    bl _malloc
-; CHECK-APPLE-ARM64_32-NEXT:    mov w8, #1
+; CHECK-APPLE-ARM64_32-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-APPLE-ARM64_32-NEXT:    add x9, x29, #16
-; CHECK-APPLE-ARM64_32-NEXT:    orr w10, w9, #0x4
-; CHECK-APPLE-ARM64_32-NEXT:    and x11, x9, #0xfffffff0
 ; CHECK-APPLE-ARM64_32-NEXT:    strb w8, [x0, #8]
-; CHECK-APPLE-ARM64_32-NEXT:    stur w10, [x29, #-8]
-; CHECK-APPLE-ARM64_32-NEXT:    ldr w8, [x11]
-; CHECK-APPLE-ARM64_32-NEXT:    orr w11, w9, #0x8
-; CHECK-APPLE-ARM64_32-NEXT:    stp w8, w11, [x29, #-12]
-; CHECK-APPLE-ARM64_32-NEXT:    orr w8, w9, #0xc
-; CHECK-APPLE-ARM64_32-NEXT:    ldr w9, [x10]
+; CHECK-APPLE-ARM64_32-NEXT:    orr w8, w9, #0x4
+; CHECK-APPLE-ARM64_32-NEXT:    and x10, x9, #0xfffffff0
 ; CHECK-APPLE-ARM64_32-NEXT:    stur w8, [x29, #-8]
-; CHECK-APPLE-ARM64_32-NEXT:    str w9, [sp, #16]
-; CHECK-APPLE-ARM64_32-NEXT:    ldr w8, [x11]
+; CHECK-APPLE-ARM64_32-NEXT:    ldr w11, [x10]
+; CHECK-APPLE-ARM64_32-NEXT:    orr w10, w9, #0x8
+; CHECK-APPLE-ARM64_32-NEXT:    stp w11, w10, [x29, #-12]
+; CHECK-APPLE-ARM64_32-NEXT:    ldr w8, [x8]
+; CHECK-APPLE-ARM64_32-NEXT:    orr w9, w9, #0xc
+; CHECK-APPLE-ARM64_32-NEXT:    str w8, [sp, #16]
+; CHECK-APPLE-ARM64_32-NEXT:    stur w9, [x29, #-8]
 ; CHECK-APPLE-ARM64_32-NEXT:    fmov s0, #1.00000000
+; CHECK-APPLE-ARM64_32-NEXT:    ldr w8, [x10]
 ; CHECK-APPLE-ARM64_32-NEXT:    mov x21, x0
 ; CHECK-APPLE-ARM64_32-NEXT:    str w8, [sp, #12]
 ; CHECK-APPLE-ARM64_32-NEXT:    ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
@@ -1000,11 +1000,11 @@ define float @foo_vararg(ptr swifterror %error_ptr_ref, ...) {
 ; CHECK-O0-ARM64_32-NEXT:    str x30, [sp, #32] ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-O0-ARM64_32-NEXT:    .cfi_offset w30, -16
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #16
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #16 ; =0x10
 ; CHECK-O0-ARM64_32-NEXT:    mov w0, w8
 ; CHECK-O0-ARM64_32-NEXT:    bl _malloc
 ; CHECK-O0-ARM64_32-NEXT:    mov x21, x0
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #1
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-ARM64_32-NEXT:    strb w8, [x0, #8]
 ; CHECK-O0-ARM64_32-NEXT:    add x8, sp, #48
 ; CHECK-O0-ARM64_32-NEXT:    ; kill: def $w8 killed $w8 killed $x8
@@ -1079,10 +1079,10 @@ define float @caller4(ptr %error_ref) {
 ; CHECK-APPLE-AARCH64-NEXT:    .cfi_offset w21, -40
 ; CHECK-APPLE-AARCH64-NEXT:    .cfi_offset w22, -48
 ; CHECK-APPLE-AARCH64-NEXT:    mov x19, x0
-; CHECK-APPLE-AARCH64-NEXT:    mov w8, #10
-; CHECK-APPLE-AARCH64-NEXT:    mov w9, #11
-; CHECK-APPLE-AARCH64-NEXT:    mov w10, #12
+; CHECK-APPLE-AARCH64-NEXT:    mov w8, #10 ; =0xa
+; CHECK-APPLE-AARCH64-NEXT:    mov w9, #11 ; =0xb
 ; CHECK-APPLE-AARCH64-NEXT:    stp w9, w8, [sp, #32]
+; CHECK-APPLE-AARCH64-NEXT:    mov w10, #12 ; =0xc
 ; CHECK-APPLE-AARCH64-NEXT:    str w10, [sp, #28]
 ; CHECK-APPLE-AARCH64-NEXT:    mov x21, xzr
 ; CHECK-APPLE-AARCH64-NEXT:    stp x9, x10, [sp, #8]
@@ -1116,11 +1116,11 @@ define float @caller4(ptr %error_ref) {
 ; CHECK-O0-AARCH64-NEXT:    ; implicit-def: $x1
 ; CHECK-O0-AARCH64-NEXT:    str x0, [sp, #24] ; 8-byte Folded Spill
 ; CHECK-O0-AARCH64-NEXT:    mov x21, xzr
-; CHECK-O0-AARCH64-NEXT:    mov w8, #10
+; CHECK-O0-AARCH64-NEXT:    mov w8, #10 ; =0xa
 ; CHECK-O0-AARCH64-NEXT:    stur w8, [x29, #-28]
-; CHECK-O0-AARCH64-NEXT:    mov w8, #11
+; CHECK-O0-AARCH64-NEXT:    mov w8, #11 ; =0xb
 ; CHECK-O0-AARCH64-NEXT:    stur w8, [x29, #-32]
-; CHECK-O0-AARCH64-NEXT:    mov w8, #12
+; CHECK-O0-AARCH64-NEXT:    mov w8, #12 ; =0xc
 ; CHECK-O0-AARCH64-NEXT:    stur w8, [x29, #-36]
 ; CHECK-O0-AARCH64-NEXT:    ldur w8, [x29, #-28]
 ; CHECK-O0-AARCH64-NEXT:    ; kill: def $x8 killed $w8
@@ -1164,13 +1164,13 @@ define float @caller4(ptr %error_ref) {
 ; CHECK-APPLE-ARM64_32-NEXT:    .cfi_offset w21, -40
 ; CHECK-APPLE-ARM64_32-NEXT:    .cfi_offset w22, -48
 ; CHECK-APPLE-ARM64_32-NEXT:    mov x19, x0
-; CHECK-APPLE-ARM64_32-NEXT:    mov w8, #10
-; CHECK-APPLE-ARM64_32-NEXT:    mov w9, #11
-; CHECK-APPLE-ARM64_32-NEXT:    mov w10, #12
+; CHECK-APPLE-ARM64_32-NEXT:    mov w8, #10 ; =0xa
+; CHECK-APPLE-ARM64_32-NEXT:    mov w9, #11 ; =0xb
 ; CHECK-APPLE-ARM64_32-NEXT:    stp w9, w8, [sp, #20]
+; CHECK-APPLE-ARM64_32-NEXT:    mov w10, #12 ; =0xc
 ; CHECK-APPLE-ARM64_32-NEXT:    str w10, [sp, #16]
 ; CHECK-APPLE-ARM64_32-NEXT:    mov x21, xzr
-; CHECK-APPLE-ARM64_32-NEXT:    mov x9, #11
+; CHECK-APPLE-ARM64_32-NEXT:    mov x9, #11 ; =0xb
 ; CHECK-APPLE-ARM64_32-NEXT:    movk x9, #12, lsl #32
 ; CHECK-APPLE-ARM64_32-NEXT:    stur x9, [sp, #4]
 ; CHECK-APPLE-ARM64_32-NEXT:    str w8, [sp]
@@ -1202,11 +1202,11 @@ define float @caller4(ptr %error_ref) {
 ; CHECK-O0-ARM64_32-NEXT:    ; implicit-def: $x1
 ; CHECK-O0-ARM64_32-NEXT:    str x0, [sp, #16] ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:    mov x21, xzr
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #10
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #10 ; =0xa
 ; CHECK-O0-ARM64_32-NEXT:    str w8, [sp, #40]
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #11
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #11 ; =0xb
 ; CHECK-O0-ARM64_32-NEXT:    str w8, [sp, #36]
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #12
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #12 ; =0xc
 ; CHECK-O0-ARM64_32-NEXT:    str w8, [sp, #32]
 ; CHECK-O0-ARM64_32-NEXT:    ldr w8, [sp, #40]
 ; CHECK-O0-ARM64_32-NEXT:    ldr w10, [sp, #36]
@@ -1467,14 +1467,14 @@ define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, i64, i64, ptr s
 ; CHECK-APPLE-NEXT:    mov x28, x2
 ; CHECK-APPLE-NEXT:    mov x19, x1
 ; CHECK-APPLE-NEXT:    mov x22, x0
-; CHECK-APPLE-NEXT:    mov w0, #1
-; CHECK-APPLE-NEXT:    mov w1, #2
-; CHECK-APPLE-NEXT:    mov w2, #3
-; CHECK-APPLE-NEXT:    mov w3, #4
-; CHECK-APPLE-NEXT:    mov w4, #5
-; CHECK-APPLE-NEXT:    mov w5, #6
-; CHECK-APPLE-NEXT:    mov w6, #7
-; CHECK-APPLE-NEXT:    mov w7, #8
+; CHECK-APPLE-NEXT:    mov w0, #1 ; =0x1
+; CHECK-APPLE-NEXT:    mov w1, #2 ; =0x2
+; CHECK-APPLE-NEXT:    mov w2, #3 ; =0x3
+; CHECK-APPLE-NEXT:    mov w3, #4 ; =0x4
+; CHECK-APPLE-NEXT:    mov w4, #5 ; =0x5
+; CHECK-APPLE-NEXT:    mov w5, #6 ; =0x6
+; CHECK-APPLE-NEXT:    mov w6, #7 ; =0x7
+; CHECK-APPLE-NEXT:    mov w7, #8 ; =0x8
 ; CHECK-APPLE-NEXT:    mov x20, xzr
 ; CHECK-APPLE-NEXT:    mov x21, xzr
 ; CHECK-APPLE-NEXT:    bl _params_in_reg2
@@ -1520,21 +1520,21 @@ define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, i64, i64, ptr s
 ; CHECK-O0-AARCH64-NEXT:    ; implicit-def: $x0
 ; CHECK-O0-AARCH64-NEXT:    mov x20, xzr
 ; CHECK-O0-AARCH64-NEXT:    mov x21, x20
-; CHECK-O0-AARCH64-NEXT:    mov w8, #1
+; CHECK-O0-AARCH64-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-AARCH64-NEXT:    mov w0, w8
-; CHECK-O0-AARCH64-NEXT:    mov w8, #2
+; CHECK-O0-AARCH64-NEXT:    mov w8, #2 ; =0x2
 ; CHECK-O0-AARCH64-NEXT:    mov w1, w8
-; CHECK-O0-AARCH64-NEXT:    mov w8, #3
+; CHECK-O0-AARCH64-NEXT:    mov w8, #3 ; =0x3
 ; CHECK-O0-AARCH64-NEXT:    mov w2, w8
-; CHECK-O0-AARCH64-NEXT:    mov w8, #4
+; CHECK-O0-AARCH64-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-O0-AARCH64-NEXT:    mov w3, w8
-; CHECK-O0-AARCH64-NEXT:    mov w8, #5
+; CHECK-O0-AARCH64-NEXT:    mov w8, #5 ; =0x5
 ; CHECK-O0-AARCH64-NEXT:    mov w4, w8
-; CHECK-O0-AARCH64-NEXT:    mov w8, #6
+; CHECK-O0-AARCH64-NEXT:    mov w8, #6 ; =0x6
 ; CHECK-O0-AARCH64-NEXT:    mov w5, w8
-; CHECK-O0-AARCH64-NEXT:    mov w8, #7
+; CHECK-O0-AARCH64-NEXT:    mov w8, #7 ; =0x7
 ; CHECK-O0-AARCH64-NEXT:    mov w6, w8
-; CHECK-O0-AARCH64-NEXT:    mov w8, #8
+; CHECK-O0-AARCH64-NEXT:    mov w8, #8 ; =0x8
 ; CHECK-O0-AARCH64-NEXT:    mov w7, w8
 ; CHECK-O0-AARCH64-NEXT:    bl _params_in_reg2
 ; CHECK-O0-AARCH64-NEXT:    ldr x20, [sp, #8] ; 8-byte Folded Reload
@@ -1574,21 +1574,21 @@ define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, i64, i64, ptr s
 ; CHECK-O0-ARM64_32-NEXT:    ; implicit-def: $x0
 ; CHECK-O0-ARM64_32-NEXT:    mov x20, xzr
 ; CHECK-O0-ARM64_32-NEXT:    mov x21, x20
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #1
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-ARM64_32-NEXT:    mov w0, w8
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #2
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #2 ; =0x2
 ; CHECK-O0-ARM64_32-NEXT:    mov w1, w8
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #3
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #3 ; =0x3
 ; CHECK-O0-ARM64_32-NEXT:    mov w2, w8
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #4
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-O0-ARM64_32-NEXT:    mov w3, w8
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #5
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #5 ; =0x5
 ; CHECK-O0-ARM64_32-NEXT:    mov w4, w8
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #6
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #6 ; =0x6
 ; CHECK-O0-ARM64_32-NEXT:    mov w5, w8
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #7
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #7 ; =0x7
 ; CHECK-O0-ARM64_32-NEXT:    mov w6, w8
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #8
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #8 ; =0x8
 ; CHECK-O0-ARM64_32-NEXT:    mov w7, w8
 ; CHECK-O0-ARM64_32-NEXT:    bl _params_in_reg2
 ; CHECK-O0-ARM64_32-NEXT:    ldr x20, [sp, #8] ; 8-byte Folded Reload
@@ -1646,14 +1646,14 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_
 ; CHECK-APPLE-NEXT:    mov x28, x2
 ; CHECK-APPLE-NEXT:    mov x19, x1
 ; CHECK-APPLE-NEXT:    mov x22, x0
-; CHECK-APPLE-NEXT:    mov w0, #1
-; CHECK-APPLE-NEXT:    mov w1, #2
-; CHECK-APPLE-NEXT:    mov w2, #3
-; CHECK-APPLE-NEXT:    mov w3, #4
-; CHECK-APPLE-NEXT:    mov w4, #5
-; CHECK-APPLE-NEXT:    mov w5, #6
-; CHECK-APPLE-NEXT:    mov w6, #7
-; CHECK-APPLE-NEXT:    mov w7, #8
+; CHECK-APPLE-NEXT:    mov w0, #1 ; =0x1
+; CHECK-APPLE-NEXT:    mov w1, #2 ; =0x2
+; CHECK-APPLE-NEXT:    mov w2, #3 ; =0x3
+; CHECK-APPLE-NEXT:    mov w3, #4 ; =0x4
+; CHECK-APPLE-NEXT:    mov w4, #5 ; =0x5
+; CHECK-APPLE-NEXT:    mov w5, #6 ; =0x6
+; CHECK-APPLE-NEXT:    mov w6, #7 ; =0x7
+; CHECK-APPLE-NEXT:    mov w7, #8 ; =0x8
 ; CHECK-APPLE-NEXT:    mov x20, xzr
 ; CHECK-APPLE-NEXT:    mov x21, xzr
 ; CHECK-APPLE-NEXT:    bl _params_in_reg2
@@ -1677,14 +1677,14 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_
 ; CHECK-APPLE-NEXT:    mov x28, x6
 ; CHECK-APPLE-NEXT:    mov x23, x7
 ; CHECK-APPLE-NEXT:    str x21, [sp, #24] ; 8-byte Folded Spill
-; CHECK-APPLE-NEXT:    mov w0, #1
-; CHECK-APPLE-NEXT:    mov w1, #2
-; CHECK-APPLE-NEXT:    mov w2, #3
-; CHECK-APPLE-NEXT:    mov w3, #4
-; CHECK-APPLE-NEXT:    mov w4, #5
-; CHECK-APPLE-NEXT:    mov w5, #6
-; CHECK-APPLE-NEXT:    mov w6, #7
-; CHECK-APPLE-NEXT:    mov w7, #8
+; CHECK-APPLE-NEXT:    mov w0, #1 ; =0x1
+; CHECK-APPLE-NEXT:    mov w1, #2 ; =0x2
+; CHECK-APPLE-NEXT:    mov w2, #3 ; =0x3
+; CHECK-APPLE-NEXT:    mov w3, #4 ; =0x4
+; CHECK-APPLE-NEXT:    mov w4, #5 ; =0x5
+; CHECK-APPLE-NEXT:    mov w5, #6 ; =0x6
+; CHECK-APPLE-NEXT:    mov w6, #7 ; =0x7
+; CHECK-APPLE-NEXT:    mov w7, #8 ; =0x8
 ; CHECK-APPLE-NEXT:    mov x20, xzr
 ; CHECK-APPLE-NEXT:    ldr x21, [sp, #8] ; 8-byte Folded Reload
 ; CHECK-APPLE-NEXT:    bl _params_in_reg2
@@ -1730,28 +1730,28 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_
 ; CHECK-O0-AARCH64-NEXT:    mov x20, xzr
 ; CHECK-O0-AARCH64-NEXT:    str x20, [sp, #80] ; 8-byte Folded Spill
 ; CHECK-O0-AARCH64-NEXT:    mov x21, x20
-; CHECK-O0-AARCH64-NEXT:    mov w8, #1
+; CHECK-O0-AARCH64-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-AARCH64-NEXT:    mov w0, w8
 ; CHECK-O0-AARCH64-NEXT:    str x0, [sp, #88] ; 8-byte Folded Spill
-; CHECK-O0-AARCH64-NEXT:    mov w8, #2
+; CHECK-O0-AARCH64-NEXT:    mov w8, #2 ; =0x2
 ; CHECK-O0-AARCH64-NEXT:    mov w1, w8
 ; CHECK-O0-AARCH64-NEXT:    str x1, [sp, #96] ; 8-byte Folded Spill
-; CHECK-O0-AARCH64-NEXT:    mov w8, #3
+; CHECK-O0-AARCH64-NEXT:    mov w8, #3 ; =0x3
 ; CHECK-O0-AARCH64-NEXT:    mov w2, w8
 ; CHECK-O0-AARCH64-NEXT:    str x2, [sp, #104] ; 8-byte Folded Spill
-; CHECK-O0-AARCH64-NEXT:    mov w8, #4
+; CHECK-O0-AARCH64-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-O0-AARCH64-NEXT:    mov w3, w8
 ; CHECK-O0-AARCH64-NEXT:    str x3, [sp, #112] ; 8-byte Folded Spill
-; CHECK-O0-AARCH64-NEXT:    mov w8, #5
+; CHECK-O0-AARCH64-NEXT:    mov w8, #5 ; =0x5
 ; CHECK-O0-AARCH64-NEXT:    mov w4, w8
 ; CHECK-O0-AARCH64-NEXT:    str x4, [sp, #120] ; 8-byte Folded Spill
-; CHECK-O0-AARCH64-NEXT:    mov w8, #6
+; CHECK-O0-AARCH64-NEXT:    mov w8, #6 ; =0x6
 ; CHECK-O0-AARCH64-NEXT:    mov w5, w8
 ; CHECK-O0-AARCH64-NEXT:    str x5, [sp, #128] ; 8-byte Folded Spill
-; CHECK-O0-AARCH64-NEXT:    mov w8, #7
+; CHECK-O0-AARCH64-NEXT:    mov w8, #7 ; =0x7
 ; CHECK-O0-AARCH64-NEXT:    mov w6, w8
 ; CHECK-O0-AARCH64-NEXT:    stur x6, [x29, #-120] ; 8-byte Folded Spill
-; CHECK-O0-AARCH64-NEXT:    mov w8, #8
+; CHECK-O0-AARCH64-NEXT:    mov w8, #8 ; =0x8
 ; CHECK-O0-AARCH64-NEXT:    mov w7, w8
 ; CHECK-O0-AARCH64-NEXT:    stur x7, [x29, #-112] ; 8-byte Folded Spill
 ; CHECK-O0-AARCH64-NEXT:    bl _params_in_reg2
@@ -1835,28 +1835,28 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_
 ; CHECK-O0-ARM64_32-NEXT:    mov x20, xzr
 ; CHECK-O0-ARM64_32-NEXT:    str x20, [sp, #80] ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:    mov x21, x20
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #1
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-O0-ARM64_32-NEXT:    mov w0, w8
 ; CHECK-O0-ARM64_32-NEXT:    str x0, [sp, #88] ; 8-byte Folded Spill
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #2
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #2 ; =0x2
 ; CHECK-O0-ARM64_32-NEXT:    mov w1, w8
 ; CHECK-O0-ARM64_32-NEXT:    str x1, [sp, #96] ; 8-byte Folded Spill
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #3
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #3 ; =0x3
 ; CHECK-O0-ARM64_32-NEXT:    mov w2, w8
 ; CHECK-O0-ARM64_32-NEXT:    str x2, [sp, #104] ; 8-byte Folded Spill
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #4
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #4 ; =0x4
 ; CHECK-O0-ARM64_32-NEXT:    mov w3, w8
 ; CHECK-O0-ARM64_32-NEXT:    str x3, [sp, #112] ; 8-byte Folded Spill
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #5
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #5 ; =0x5
 ; CHECK-O0-ARM64_32-NEXT:    mov w4, w8
 ; CHECK-O0-ARM64_32-NEXT:    str x4, [sp, #120] ; 8-byte Folded Spill
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #6
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #6 ; =0x6
 ; CHECK-O0-ARM64_32-NEXT:    mov w5, w8
 ; CHECK-O0-ARM64_32-NEXT:    str x5, [sp, #128] ; 8-byte Folded Spill
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #7
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #7 ; =0x7
 ; CHECK-O0-ARM64_32-NEXT:    mov w6, w8
 ; CHECK-O0-ARM64_32-NEXT:    str x6, [sp, #136] ; 8-byte Folded Spill
-; CHECK-O0-ARM64_32-NEXT:    mov w8, #8
+; CHECK-O0-ARM64_32-NEXT:    mov w8, #8 ; =0x8
 ; CHECK-O0-ARM64_32-NEXT:    mov w7, w8
 ; CHECK-O0-ARM64_32-NEXT:    str x7, [sp, #144] ; 8-byte Folded Spill
 ; CHECK-O0-ARM64_32-NEXT:    bl _params_in_reg2

diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index a01a3826e340df..be3df664c8876a 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -16,28 +16,28 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    b .LBB0_6
 ; CHECK-NEXT:  .LBB0_3: // %vector.ph
 ; CHECK-NEXT:    add x11, x8, #1
-; CHECK-NEXT:    mov w15, #1132396544
-; CHECK-NEXT:    and x10, x11, #0x1fffffff8
+; CHECK-NEXT:    mov w8, #1132396544 // =0x437f0000
 ; CHECK-NEXT:    add x12, x0, #4
-; CHECK-NEXT:    add x9, x0, x10
+; CHECK-NEXT:    and x10, x11, #0x1fffffff8
+; CHECK-NEXT:    dup v0.4s, w8
 ; CHECK-NEXT:    add x13, x1, #16
 ; CHECK-NEXT:    add x8, x1, x10, lsl #2
+; CHECK-NEXT:    add x9, x0, x10
 ; CHECK-NEXT:    mov x14, x10
-; CHECK-NEXT:    dup v0.4s, w15
 ; CHECK-NEXT:  .LBB0_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp q1, q2, [x13, #-16]
 ; CHECK-NEXT:    subs x14, x14, #8
 ; CHECK-NEXT:    add x13, x13, #32
 ; CHECK-NEXT:    fcmgt v3.4s, v1.4s, v0.4s
-; CHECK-NEXT:    fcmlt v5.4s, v1.4s, #0.0
 ; CHECK-NEXT:    fcmgt v4.4s, v2.4s, v0.4s
+; CHECK-NEXT:    fcmlt v5.4s, v1.4s, #0.0
 ; CHECK-NEXT:    fcmlt v6.4s, v2.4s, #0.0
 ; CHECK-NEXT:    bit v1.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    bit v2.16b, v0.16b, v4.16b
 ; CHECK-NEXT:    bic v1.16b, v1.16b, v5.16b
-; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-NEXT:    bic v2.16b, v2.16b, v6.16b
+; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
 ; CHECK-NEXT:    xtn v1.4h, v1.4s
 ; CHECK-NEXT:    xtn v2.4h, v2.4s
@@ -53,11 +53,11 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:  .LBB0_6: // %for.body.preheader1
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    sub w10, w2, w10
-; CHECK-NEXT:    mov w11, #1132396544
+; CHECK-NEXT:    mov w11, #1132396544 // =0x437f0000
 ; CHECK-NEXT:  .LBB0_7: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr s1, [x8], #4
 ; CHECK-NEXT:    fmov s2, w11
+; CHECK-NEXT:    ldr s1, [x8], #4
 ; CHECK-NEXT:    fcmp s1, s2
 ; CHECK-NEXT:    fcsel s2, s2, s1, gt
 ; CHECK-NEXT:    fcmp s1, #0.0
@@ -166,23 +166,23 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:  .LBB1_5: // %for.body.preheader1
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    sub w10, w2, w10
-; CHECK-NEXT:    mov w11, #1132396544
+; CHECK-NEXT:    mov w11, #1132396544 // =0x437f0000
 ; CHECK-NEXT:  .LBB1_6: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp s2, s3, [x8], #8
-; CHECK-NEXT:    fmov s1, w11
-; CHECK-NEXT:    fcmp s2, s1
-; CHECK-NEXT:    fcsel s4, s1, s2, gt
-; CHECK-NEXT:    fcmp s2, #0.0
-; CHECK-NEXT:    fcsel s2, s0, s4, mi
-; CHECK-NEXT:    fcmp s3, s1
-; CHECK-NEXT:    fcsel s1, s1, s3, gt
+; CHECK-NEXT:    ldp s1, s3, [x8], #8
+; CHECK-NEXT:    fmov s2, w11
+; CHECK-NEXT:    fcmp s1, s2
+; CHECK-NEXT:    fcsel s4, s2, s1, gt
+; CHECK-NEXT:    fcmp s1, #0.0
+; CHECK-NEXT:    fcsel s1, s0, s4, mi
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    fcsel s2, s2, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
-; CHECK-NEXT:    fcvtzs w12, s2
-; CHECK-NEXT:    fcsel s1, s0, s1, mi
-; CHECK-NEXT:    strb w12, [x9]
+; CHECK-NEXT:    fcvtzs w12, s1
+; CHECK-NEXT:    fcsel s2, s0, s2, mi
 ; CHECK-NEXT:    subs w10, w10, #1
-; CHECK-NEXT:    fcvtzs w13, s1
+; CHECK-NEXT:    strb w12, [x9]
+; CHECK-NEXT:    fcvtzs w13, s2
 ; CHECK-NEXT:    strb w13, [x9, #1]
 ; CHECK-NEXT:    add x9, x9, #2
 ; CHECK-NEXT:    b.ne .LBB1_6
@@ -190,25 +190,25 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_8: // %vector.ph
 ; CHECK-NEXT:    add x11, x8, #1
-; CHECK-NEXT:    mov w13, #1132396544
+; CHECK-NEXT:    mov w8, #1132396544 // =0x437f0000
 ; CHECK-NEXT:    and x10, x11, #0x1fffffffc
-; CHECK-NEXT:    mov x12, x10
+; CHECK-NEXT:    dup v0.4s, w8
 ; CHECK-NEXT:    add x8, x1, x10, lsl #3
 ; CHECK-NEXT:    add x9, x0, x10, lsl #1
-; CHECK-NEXT:    dup v0.4s, w13
+; CHECK-NEXT:    mov x12, x10
 ; CHECK-NEXT:  .LBB1_9: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld2 { v1.4s, v2.4s }, [x1], #32
 ; CHECK-NEXT:    fcmgt v3.4s, v1.4s, v0.4s
-; CHECK-NEXT:    subs x12, x12, #4
 ; CHECK-NEXT:    fcmgt v4.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    fcmlt v5.4s, v1.4s, #0.0
+; CHECK-NEXT:    subs x12, x12, #4
 ; CHECK-NEXT:    bsl v3.16b, v0.16b, v1.16b
-; CHECK-NEXT:    bsl v4.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    fcmlt v1.4s, v2.4s, #0.0
+; CHECK-NEXT:    bsl v4.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    bic v2.16b, v3.16b, v5.16b
-; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
 ; CHECK-NEXT:    bic v1.16b, v4.16b, v1.16b
+; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-NEXT:    xtn v2.4h, v2.4s
 ; CHECK-NEXT:    xtn v1.4h, v1.4s
@@ -320,100 +320,97 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-LABEL: loop3:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    subs w8, w2, #1
-; CHECK-NEXT:    b.lt .LBB2_7
+; CHECK-NEXT:    b.lt .LBB2_9
 ; CHECK-NEXT:  // %bb.1: // %for.body.preheader
 ; CHECK-NEXT:    cmp w8, #2
-; CHECK-NEXT:    b.ls .LBB2_4
+; CHECK-NEXT:    b.ls .LBB2_6
 ; CHECK-NEXT:  // %bb.2: // %vector.memcheck
 ; CHECK-NEXT:    add x9, x8, w8, uxtw #1
 ; CHECK-NEXT:    add x9, x9, #3
 ; CHECK-NEXT:    add x10, x1, x9, lsl #2
+; CHECK-NEXT:    add x9, x0, x9
 ; CHECK-NEXT:    cmp x10, x0
-; CHECK-NEXT:    b.ls .LBB2_8
-; CHECK-NEXT:  // %bb.3: // %vector.memcheck
+; CHECK-NEXT:    ccmp x9, x1, #0, hi
+; CHECK-NEXT:    b.hi .LBB2_6
+; CHECK-NEXT:  // %bb.3: // %vector.ph
+; CHECK-NEXT:    add x11, x8, #1
+; CHECK-NEXT:    mov w8, #1132396544 // =0x437f0000
+; CHECK-NEXT:    adrp x12, .LCPI2_0
+; CHECK-NEXT:    and x10, x11, #0x1fffffffc
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    ldr q1, [x12, :lo12:.LCPI2_0]
+; CHECK-NEXT:    add x9, x10, x10, lsl #1
+; CHECK-NEXT:    mov x12, x10
+; CHECK-NEXT:    add x8, x1, x9, lsl #2
 ; CHECK-NEXT:    add x9, x0, x9
-; CHECK-NEXT:    cmp x9, x1
-; CHECK-NEXT:    b.ls .LBB2_8
-; CHECK-NEXT:  .LBB2_4:
+; CHECK-NEXT:  .LBB2_4: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld3 { v2.4s, v3.4s, v4.4s }, [x1], #48
+; CHECK-NEXT:    fcmgt v5.4s, v2.4s, v0.4s
+; CHECK-NEXT:    fcmgt v6.4s, v3.4s, v0.4s
+; CHECK-NEXT:    fcmgt v7.4s, v4.4s, v0.4s
+; CHECK-NEXT:    fcmlt v16.4s, v2.4s, #0.0
+; CHECK-NEXT:    fcmlt v17.4s, v3.4s, #0.0
+; CHECK-NEXT:    add x13, x0, #8
+; CHECK-NEXT:    subs x12, x12, #4
+; CHECK-NEXT:    bsl v5.16b, v0.16b, v2.16b
+; CHECK-NEXT:    fcmlt v2.4s, v4.4s, #0.0
+; CHECK-NEXT:    bsl v6.16b, v0.16b, v3.16b
+; CHECK-NEXT:    bsl v7.16b, v0.16b, v4.16b
+; CHECK-NEXT:    bic v3.16b, v5.16b, v16.16b
+; CHECK-NEXT:    bic v4.16b, v6.16b, v17.16b
+; CHECK-NEXT:    bic v2.16b, v7.16b, v2.16b
+; CHECK-NEXT:    fcvtzs v3.4s, v3.4s
+; CHECK-NEXT:    fcvtzs v4.4s, v4.4s
+; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
+; CHECK-NEXT:    xtn v5.4h, v3.4s
+; CHECK-NEXT:    xtn v6.4h, v4.4s
+; CHECK-NEXT:    xtn v7.4h, v2.4s
+; CHECK-NEXT:    tbl v2.16b, { v5.16b, v6.16b, v7.16b }, v1.16b
+; CHECK-NEXT:    st1 { v2.s }[2], [x13]
+; CHECK-NEXT:    str d2, [x0], #12
+; CHECK-NEXT:    b.ne .LBB2_4
+; CHECK-NEXT:  // %bb.5: // %middle.block
+; CHECK-NEXT:    cmp x11, x10
+; CHECK-NEXT:    b.ne .LBB2_7
+; CHECK-NEXT:    b .LBB2_9
+; CHECK-NEXT:  .LBB2_6:
 ; CHECK-NEXT:    mov w10, wzr
 ; CHECK-NEXT:    mov x8, x1
 ; CHECK-NEXT:    mov x9, x0
-; CHECK-NEXT:  .LBB2_5: // %for.body.preheader1
+; CHECK-NEXT:  .LBB2_7: // %for.body.preheader1
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    sub w10, w2, w10
-; CHECK-NEXT:    mov w11, #1132396544
-; CHECK-NEXT:  .LBB2_6: // %for.body
+; CHECK-NEXT:    mov w11, #1132396544 // =0x437f0000
+; CHECK-NEXT:  .LBB2_8: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp s2, s3, [x8]
-; CHECK-NEXT:    fmov s1, w11
-; CHECK-NEXT:    fcmp s2, s1
-; CHECK-NEXT:    fcsel s4, s1, s2, gt
-; CHECK-NEXT:    fcmp s2, #0.0
-; CHECK-NEXT:    fcsel s2, s0, s4, mi
-; CHECK-NEXT:    fcmp s3, s1
-; CHECK-NEXT:    fcsel s4, s1, s3, gt
+; CHECK-NEXT:    ldp s1, s3, [x8]
+; CHECK-NEXT:    fmov s2, w11
+; CHECK-NEXT:    fcmp s1, s2
+; CHECK-NEXT:    fcsel s4, s2, s1, gt
+; CHECK-NEXT:    fcmp s1, #0.0
+; CHECK-NEXT:    fcsel s1, s0, s4, mi
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    fcsel s4, s2, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
 ; CHECK-NEXT:    ldr s3, [x8, #8]
-; CHECK-NEXT:    fcvtzs w12, s2
+; CHECK-NEXT:    fcvtzs w12, s1
 ; CHECK-NEXT:    add x8, x8, #12
 ; CHECK-NEXT:    fcsel s4, s0, s4, mi
-; CHECK-NEXT:    fcmp s3, s1
+; CHECK-NEXT:    fcmp s3, s2
 ; CHECK-NEXT:    strb w12, [x9]
-; CHECK-NEXT:    fcsel s1, s1, s3, gt
+; CHECK-NEXT:    fcsel s2, s2, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
 ; CHECK-NEXT:    fcvtzs w13, s4
-; CHECK-NEXT:    fcsel s1, s0, s1, mi
-; CHECK-NEXT:    strb w13, [x9, #1]
+; CHECK-NEXT:    fcsel s2, s0, s2, mi
 ; CHECK-NEXT:    subs w10, w10, #1
-; CHECK-NEXT:    fcvtzs w14, s1
+; CHECK-NEXT:    strb w13, [x9, #1]
+; CHECK-NEXT:    fcvtzs w14, s2
 ; CHECK-NEXT:    strb w14, [x9, #2]
 ; CHECK-NEXT:    add x9, x9, #3
-; CHECK-NEXT:    b.ne .LBB2_6
-; CHECK-NEXT:  .LBB2_7: // %for.cond.cleanup
+; CHECK-NEXT:    b.ne .LBB2_8
+; CHECK-NEXT:  .LBB2_9: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB2_8: // %vector.ph
-; CHECK-NEXT:    add x11, x8, #1
-; CHECK-NEXT:    adrp x12, .LCPI2_0
-; CHECK-NEXT:    and x10, x11, #0x1fffffffc
-; CHECK-NEXT:    mov w13, #1132396544
-; CHECK-NEXT:    add x8, x10, x10, lsl #1
-; CHECK-NEXT:    ldr q0, [x12, :lo12:.LCPI2_0]
-; CHECK-NEXT:    add x9, x0, x8
-; CHECK-NEXT:    mov x12, x10
-; CHECK-NEXT:    add x8, x1, x8, lsl #2
-; CHECK-NEXT:    dup v1.4s, w13
-; CHECK-NEXT:  .LBB2_9: // %vector.body
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld3 { v2.4s, v3.4s, v4.4s }, [x1], #48
-; CHECK-NEXT:    fcmgt v5.4s, v2.4s, v1.4s
-; CHECK-NEXT:    add x13, x0, #8
-; CHECK-NEXT:    fcmgt v7.4s, v3.4s, v1.4s
-; CHECK-NEXT:    subs x12, x12, #4
-; CHECK-NEXT:    fcmgt v17.4s, v4.4s, v1.4s
-; CHECK-NEXT:    fcmlt v6.4s, v2.4s, #0.0
-; CHECK-NEXT:    bsl v5.16b, v1.16b, v2.16b
-; CHECK-NEXT:    fcmlt v16.4s, v3.4s, #0.0
-; CHECK-NEXT:    bsl v7.16b, v1.16b, v3.16b
-; CHECK-NEXT:    mov v2.16b, v17.16b
-; CHECK-NEXT:    bic v5.16b, v5.16b, v6.16b
-; CHECK-NEXT:    fcmlt v6.4s, v4.4s, #0.0
-; CHECK-NEXT:    bsl v2.16b, v1.16b, v4.16b
-; CHECK-NEXT:    bic v3.16b, v7.16b, v16.16b
-; CHECK-NEXT:    fcvtzs v4.4s, v5.4s
-; CHECK-NEXT:    fcvtzs v3.4s, v3.4s
-; CHECK-NEXT:    bic v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
-; CHECK-NEXT:    xtn v4.4h, v4.4s
-; CHECK-NEXT:    xtn v5.4h, v3.4s
-; CHECK-NEXT:    xtn v6.4h, v2.4s
-; CHECK-NEXT:    tbl v2.16b, { v4.16b, v5.16b, v6.16b }, v0.16b
-; CHECK-NEXT:    str d2, [x0], #12
-; CHECK-NEXT:    st1 { v2.s }[2], [x13]
-; CHECK-NEXT:    b.ne .LBB2_9
-; CHECK-NEXT:  // %bb.10: // %middle.block
-; CHECK-NEXT:    cmp x11, x10
-; CHECK-NEXT:    b.ne .LBB2_5
-; CHECK-NEXT:    b .LBB2_7
 entry:
   %cmp29 = icmp sgt i32 %width, 0
   br i1 %cmp29, label %for.body.preheader, label %for.cond.cleanup
@@ -553,82 +550,81 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:  .LBB3_5: // %for.body.preheader1
 ; CHECK-NEXT:    movi d0, #0000000000000000
 ; CHECK-NEXT:    sub w10, w2, w10
-; CHECK-NEXT:    mov w11, #1132396544
+; CHECK-NEXT:    mov w11, #1132396544 // =0x437f0000
 ; CHECK-NEXT:  .LBB3_6: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp s2, s3, [x8]
-; CHECK-NEXT:    fmov s1, w11
-; CHECK-NEXT:    fcmp s2, s1
-; CHECK-NEXT:    fcsel s4, s1, s2, gt
-; CHECK-NEXT:    fcmp s2, #0.0
-; CHECK-NEXT:    fcsel s2, s0, s4, mi
-; CHECK-NEXT:    fcmp s3, s1
-; CHECK-NEXT:    fcsel s4, s1, s3, gt
+; CHECK-NEXT:    ldp s1, s3, [x8]
+; CHECK-NEXT:    fmov s2, w11
+; CHECK-NEXT:    fcmp s1, s2
+; CHECK-NEXT:    fcsel s4, s2, s1, gt
+; CHECK-NEXT:    fcmp s1, #0.0
+; CHECK-NEXT:    fcsel s1, s0, s4, mi
+; CHECK-NEXT:    fcmp s3, s2
+; CHECK-NEXT:    fcsel s4, s2, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
 ; CHECK-NEXT:    ldp s3, s5, [x8, #8]
-; CHECK-NEXT:    fcvtzs w12, s2
+; CHECK-NEXT:    fcvtzs w12, s1
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    fcsel s4, s0, s4, mi
-; CHECK-NEXT:    fcmp s3, s1
+; CHECK-NEXT:    fcmp s3, s2
 ; CHECK-NEXT:    strb w12, [x9]
-; CHECK-NEXT:    fcsel s6, s1, s3, gt
+; CHECK-NEXT:    fcsel s6, s2, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
 ; CHECK-NEXT:    fcvtzs w13, s4
 ; CHECK-NEXT:    fcsel s3, s0, s6, mi
-; CHECK-NEXT:    fcmp s5, s1
+; CHECK-NEXT:    fcmp s5, s2
 ; CHECK-NEXT:    strb w13, [x9, #1]
-; CHECK-NEXT:    fcsel s1, s1, s5, gt
+; CHECK-NEXT:    fcsel s2, s2, s5, gt
 ; CHECK-NEXT:    fcmp s5, #0.0
 ; CHECK-NEXT:    fcvtzs w14, s3
-; CHECK-NEXT:    fcsel s1, s0, s1, mi
-; CHECK-NEXT:    strb w14, [x9, #2]
+; CHECK-NEXT:    fcsel s2, s0, s2, mi
 ; CHECK-NEXT:    subs w10, w10, #1
-; CHECK-NEXT:    fcvtzs w12, s1
-; CHECK-NEXT:    strb w12, [x9, #3]
+; CHECK-NEXT:    strb w14, [x9, #2]
+; CHECK-NEXT:    fcvtzs w15, s2
+; CHECK-NEXT:    strb w15, [x9, #3]
 ; CHECK-NEXT:    add x9, x9, #4
 ; CHECK-NEXT:    b.ne .LBB3_6
 ; CHECK-NEXT:  .LBB3_7: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB3_8: // %vector.ph
 ; CHECK-NEXT:    add x11, x8, #1
+; CHECK-NEXT:    mov w8, #1132396544 // =0x437f0000
 ; CHECK-NEXT:    adrp x12, .LCPI3_0
 ; CHECK-NEXT:    and x10, x11, #0x1fffffffc
-; CHECK-NEXT:    mov w13, #1132396544
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    ldr q1, [x12, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    add x8, x1, x10, lsl #4
 ; CHECK-NEXT:    add x9, x0, x10, lsl #2
-; CHECK-NEXT:    ldr q0, [x12, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    mov x12, x10
-; CHECK-NEXT:    dup v1.4s, w13
 ; CHECK-NEXT:  .LBB3_9: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
-; CHECK-NEXT:    fcmgt v6.4s, v2.4s, v1.4s
+; CHECK-NEXT:    fcmgt v6.4s, v2.4s, v0.4s
+; CHECK-NEXT:    fcmgt v7.4s, v3.4s, v0.4s
+; CHECK-NEXT:    fcmgt v16.4s, v4.4s, v0.4s
+; CHECK-NEXT:    fcmgt v17.4s, v5.4s, v0.4s
+; CHECK-NEXT:    fcmlt v18.4s, v2.4s, #0.0
+; CHECK-NEXT:    fcmlt v19.4s, v3.4s, #0.0
 ; CHECK-NEXT:    subs x12, x12, #4
-; CHECK-NEXT:    fcmlt v7.4s, v2.4s, #0.0
-; CHECK-NEXT:    fcmgt v16.4s, v3.4s, v1.4s
-; CHECK-NEXT:    fcmgt v19.4s, v4.4s, v1.4s
-; CHECK-NEXT:    bsl v6.16b, v1.16b, v2.16b
-; CHECK-NEXT:    fcmlt v17.4s, v3.4s, #0.0
-; CHECK-NEXT:    bsl v16.16b, v1.16b, v3.16b
-; CHECK-NEXT:    fcmlt v18.4s, v4.4s, #0.0
-; CHECK-NEXT:    bic v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    fcmgt v7.4s, v5.4s, v1.4s
-; CHECK-NEXT:    bsl v19.16b, v1.16b, v4.16b
-; CHECK-NEXT:    bic v16.16b, v16.16b, v17.16b
-; CHECK-NEXT:    fcmlt v17.4s, v5.4s, #0.0
-; CHECK-NEXT:    mov v2.16b, v7.16b
-; CHECK-NEXT:    bsl v2.16b, v1.16b, v5.16b
-; CHECK-NEXT:    fcvtzs v4.4s, v6.4s
-; CHECK-NEXT:    bic v3.16b, v19.16b, v18.16b
-; CHECK-NEXT:    fcvtzs v5.4s, v16.4s
+; CHECK-NEXT:    fcmlt v20.4s, v4.4s, #0.0
+; CHECK-NEXT:    bsl v6.16b, v0.16b, v2.16b
+; CHECK-NEXT:    fcmlt v2.4s, v5.4s, #0.0
+; CHECK-NEXT:    bsl v7.16b, v0.16b, v3.16b
+; CHECK-NEXT:    bsl v16.16b, v0.16b, v4.16b
+; CHECK-NEXT:    bsl v17.16b, v0.16b, v5.16b
+; CHECK-NEXT:    bic v3.16b, v6.16b, v18.16b
+; CHECK-NEXT:    bic v4.16b, v7.16b, v19.16b
+; CHECK-NEXT:    bic v5.16b, v16.16b, v20.16b
+; CHECK-NEXT:    bic v2.16b, v17.16b, v2.16b
 ; CHECK-NEXT:    fcvtzs v3.4s, v3.4s
-; CHECK-NEXT:    bic v2.16b, v2.16b, v17.16b
+; CHECK-NEXT:    fcvtzs v4.4s, v4.4s
+; CHECK-NEXT:    fcvtzs v5.4s, v5.4s
 ; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
-; CHECK-NEXT:    xtn v16.4h, v4.4s
-; CHECK-NEXT:    xtn v17.4h, v5.4s
-; CHECK-NEXT:    xtn v18.4h, v3.4s
+; CHECK-NEXT:    xtn v16.4h, v3.4s
+; CHECK-NEXT:    xtn v17.4h, v4.4s
+; CHECK-NEXT:    xtn v18.4h, v5.4s
 ; CHECK-NEXT:    xtn v19.4h, v2.4s
-; CHECK-NEXT:    tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
+; CHECK-NEXT:    tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b
 ; CHECK-NEXT:    str q2, [x0], #16
 ; CHECK-NEXT:    b.ne .LBB3_9
 ; CHECK-NEXT:  // %bb.10: // %middle.block

diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index f26f183630e9aa..9650a9b121654b 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -43,10 +43,10 @@ define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: trunc_v16i32_to_v16i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh0:
-; CHECK-NEXT:    adrp x9, lCPI0_0@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
 ; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:    ldr q0, [x9, lCPI0_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI0_0@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8, lsl #6
@@ -71,10 +71,10 @@ define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8, lsl #6
 ; CHECK-BE-NEXT:    add x10, x9, #16
-; CHECK-BE-NEXT:    add x11, x9, #32
 ; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
-; CHECK-BE-NEXT:    add x9, x9, #48
+; CHECK-BE-NEXT:    add x11, x9, #32
 ; CHECK-BE-NEXT:    ld1 { v2.16b }, [x10]
+; CHECK-BE-NEXT:    add x9, x9, #48
 ; CHECK-BE-NEXT:    ld1 { v3.16b }, [x11]
 ; CHECK-BE-NEXT:    ld1 { v4.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #4
@@ -109,26 +109,26 @@ exit:
 define void @trunc_v16i32_to_v16i8_no_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: trunc_v16i32_to_v16i8_no_loop:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    ldp q1, q0, [x0, #32]
-; CHECK-NEXT:    ldp q3, q2, [x0]
+; CHECK-NEXT:    ldp q1, q0, [x0]
+; CHECK-NEXT:    ldp q3, q2, [x0, #32]
 ; CHECK-NEXT:    uzp1.8h v0, v1, v0
-; CHECK-NEXT:    uzp1.8h v1, v3, v2
-; CHECK-NEXT:    uzp1.16b v0, v1, v0
+; CHECK-NEXT:    uzp1.8h v2, v3, v2
+; CHECK-NEXT:    uzp1.16b v0, v0, v2
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: trunc_v16i32_to_v16i8_no_loop:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    add x8, x0, #48
-; CHECK-BE-NEXT:    add x9, x0, #32
+; CHECK-BE-NEXT:    add x8, x0, #16
+; CHECK-BE-NEXT:    add x9, x0, #48
+; CHECK-BE-NEXT:    add x10, x0, #32
 ; CHECK-BE-NEXT:    ld1 { v0.4s }, [x0]
 ; CHECK-BE-NEXT:    ld1 { v1.4s }, [x8]
-; CHECK-BE-NEXT:    add x8, x0, #16
 ; CHECK-BE-NEXT:    ld1 { v2.4s }, [x9]
-; CHECK-BE-NEXT:    ld1 { v3.4s }, [x8]
-; CHECK-BE-NEXT:    uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-BE-NEXT:    uzp1 v0.8h, v0.8h, v3.8h
-; CHECK-BE-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-BE-NEXT:    ld1 { v3.4s }, [x10]
+; CHECK-BE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-BE-NEXT:    uzp1 v2.8h, v3.8h, v2.8h
+; CHECK-BE-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-BE-NEXT:    st1 { v0.16b }, [x1]
 ; CHECK-BE-NEXT:    ret
 entry:
@@ -179,10 +179,10 @@ define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: trunc_v8i32_to_v8i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:    adrp x9, lCPI2_0@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI2_0@PAGE
 ; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    ldr q0, [x9, lCPI2_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB2_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8, lsl #5
@@ -208,8 +208,8 @@ define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:    add x10, x9, #16
 ; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #3
-; CHECK-BE-NEXT:    add x8, x8, #1
 ; CHECK-BE-NEXT:    ld1 { v2.16b }, [x10]
+; CHECK-BE-NEXT:    add x8, x8, #1
 ; CHECK-BE-NEXT:    cmp x8, #1000
 ; CHECK-BE-NEXT:    tbl v1.16b, { v1.16b, v2.16b }, v0.16b
 ; CHECK-BE-NEXT:    st1 { v1.8b }, [x9]
@@ -274,18 +274,18 @@ define void @trunc_v16i64_to_v16i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: trunc_v16i64_to_v16i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh4:
-; CHECK-NEXT:    adrp x9, lCPI3_0@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI3_0@PAGE
 ; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:    ldr q0, [x9, lCPI3_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI3_0@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB3_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8, lsl #7
 ; CHECK-NEXT:    ldp q1, q2, [x9]
-; CHECK-NEXT:    ldp q3, q4, [x9, #32]
 ; CHECK-NEXT:    ldp q16, q17, [x9, #64]
-; CHECK-NEXT:    tbl.16b v1, { v1, v2, v3, v4 }, v0
+; CHECK-NEXT:    ldp q3, q4, [x9, #32]
 ; CHECK-NEXT:    ldp q18, q19, [x9, #96]
+; CHECK-NEXT:    tbl.16b v1, { v1, v2, v3, v4 }, v0
 ; CHECK-NEXT:    tbl.16b v2, { v16, v17, v18, v19 }, v0
 ; CHECK-NEXT:    mov.d v1[1], v2[0]
 ; CHECK-NEXT:    str q1, [x1, x8, lsl #4]
@@ -305,25 +305,25 @@ define void @trunc_v16i64_to_v16i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB3_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8, lsl #7
-; CHECK-BE-NEXT:    add x10, x9, #16
-; CHECK-BE-NEXT:    add x11, x9, #32
+; CHECK-BE-NEXT:    add x13, x9, #64
+; CHECK-BE-NEXT:    add x12, x9, #80
+; CHECK-BE-NEXT:    add x14, x9, #16
 ; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
-; CHECK-BE-NEXT:    ld1 { v2.16b }, [x10]
-; CHECK-BE-NEXT:    add x10, x9, #48
-; CHECK-BE-NEXT:    ld1 { v3.16b }, [x11]
-; CHECK-BE-NEXT:    add x11, x9, #64
-; CHECK-BE-NEXT:    ld1 { v4.16b }, [x10]
-; CHECK-BE-NEXT:    add x10, x9, #80
-; CHECK-BE-NEXT:    ld1 { v16.16b }, [x11]
+; CHECK-BE-NEXT:    ld1 { v16.16b }, [x13]
 ; CHECK-BE-NEXT:    add x11, x9, #96
-; CHECK-BE-NEXT:    add x9, x9, #112
-; CHECK-BE-NEXT:    ld1 { v17.16b }, [x10]
-; CHECK-BE-NEXT:    tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
+; CHECK-BE-NEXT:    add x13, x9, #32
+; CHECK-BE-NEXT:    ld1 { v2.16b }, [x14]
+; CHECK-BE-NEXT:    ld1 { v17.16b }, [x12]
+; CHECK-BE-NEXT:    add x10, x9, #112
+; CHECK-BE-NEXT:    add x9, x9, #48
+; CHECK-BE-NEXT:    ld1 { v3.16b }, [x13]
 ; CHECK-BE-NEXT:    ld1 { v18.16b }, [x11]
-; CHECK-BE-NEXT:    ld1 { v19.16b }, [x9]
+; CHECK-BE-NEXT:    ld1 { v4.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #4
+; CHECK-BE-NEXT:    ld1 { v19.16b }, [x10]
 ; CHECK-BE-NEXT:    add x8, x8, #1
 ; CHECK-BE-NEXT:    cmp x8, #1000
+; CHECK-BE-NEXT:    tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
 ; CHECK-BE-NEXT:    tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
 ; CHECK-BE-NEXT:    mov v1.d[1], v2.d[0]
 ; CHECK-BE-NEXT:    st1 { v1.16b }, [x9]
@@ -389,10 +389,10 @@ define void @trunc_v8i64_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-LABEL: trunc_v8i64_to_v8i8_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh6:
-; CHECK-NEXT:    adrp x9, lCPI4_0@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI4_0@PAGE
 ; CHECK-NEXT:  Lloh7:
-; CHECK-NEXT:    ldr q0, [x9, lCPI4_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI4_0@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB4_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8, lsl #6
@@ -417,10 +417,10 @@ define void @trunc_v8i64_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8, lsl #6
 ; CHECK-BE-NEXT:    add x10, x9, #16
-; CHECK-BE-NEXT:    add x11, x9, #32
 ; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
-; CHECK-BE-NEXT:    add x9, x9, #48
+; CHECK-BE-NEXT:    add x11, x9, #32
 ; CHECK-BE-NEXT:    ld1 { v2.16b }, [x10]
+; CHECK-BE-NEXT:    add x9, x9, #48
 ; CHECK-BE-NEXT:    ld1 { v3.16b }, [x11]
 ; CHECK-BE-NEXT:    ld1 { v4.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #3
@@ -458,25 +458,25 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:  LBB5_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp x10, x9, [x0]
-; CHECK-NEXT:    ldrb w11, [x0, #18]
-; CHECK-NEXT:    ldrh w13, [x0, #16]
+; CHECK-NEXT:    ldrb w13, [x0, #18]
+; CHECK-NEXT:    ldrh w14, [x0, #16]
 ; CHECK-NEXT:    add x0, x0, #32
-; CHECK-NEXT:    lsr x14, x10, #19
-; CHECK-NEXT:    fmov s0, w10
 ; CHECK-NEXT:    ubfx x12, x9, #12, #20
+; CHECK-NEXT:    fmov s0, w10
+; CHECK-NEXT:    lsr x11, x10, #19
 ; CHECK-NEXT:    lsr x15, x9, #31
-; CHECK-NEXT:    orr w11, w13, w11, lsl #16
-; CHECK-NEXT:    lsr x13, x9, #50
-; CHECK-NEXT:    mov.s v0[1], w14
 ; CHECK-NEXT:    fmov s1, w12
-; CHECK-NEXT:    lsr x12, x10, #38
-; CHECK-NEXT:    orr w13, w13, w11, lsl #14
+; CHECK-NEXT:    lsr x12, x9, #50
+; CHECK-NEXT:    mov.s v0[1], w11
+; CHECK-NEXT:    orr w11, w14, w13, lsl #16
+; CHECK-NEXT:    lsr x13, x10, #38
 ; CHECK-NEXT:    lsr x10, x10, #57
+; CHECK-NEXT:    mov.s v1[1], w15
+; CHECK-NEXT:    orr w12, w12, w11, lsl #14
 ; CHECK-NEXT:    orr w9, w10, w9, lsl #7
 ; CHECK-NEXT:    lsr w10, w11, #5
-; CHECK-NEXT:    mov.s v1[1], w15
-; CHECK-NEXT:    mov.s v0[2], w12
-; CHECK-NEXT:    mov.s v1[2], w13
+; CHECK-NEXT:    mov.s v0[2], w13
+; CHECK-NEXT:    mov.s v1[2], w12
 ; CHECK-NEXT:    mov.s v0[3], w9
 ; CHECK-NEXT:    mov.s v1[3], w10
 ; CHECK-NEXT:    uzp1.8h v0, v0, v1
@@ -494,33 +494,33 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB5_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ldp x10, x9, [x0]
-; CHECK-BE-NEXT:    ldrh w11, [x0, #16]
-; CHECK-BE-NEXT:    lsr x13, x10, #45
-; CHECK-BE-NEXT:    lsr x15, x10, #40
-; CHECK-BE-NEXT:    lsr x12, x9, #40
-; CHECK-BE-NEXT:    ubfx x14, x9, #33, #7
-; CHECK-BE-NEXT:    ubfx x16, x10, #26, #14
-; CHECK-BE-NEXT:    orr w12, w14, w12, lsl #7
-; CHECK-BE-NEXT:    ldrb w14, [x0, #18]
-; CHECK-BE-NEXT:    orr w15, w16, w15, lsl #14
-; CHECK-BE-NEXT:    fmov s0, w13
+; CHECK-BE-NEXT:    ldrb w16, [x0, #18]
+; CHECK-BE-NEXT:    lsr x11, x9, #40
+; CHECK-BE-NEXT:    ubfx x12, x9, #33, #7
+; CHECK-BE-NEXT:    lsr x15, x10, #45
+; CHECK-BE-NEXT:    lsr x13, x10, #40
+; CHECK-BE-NEXT:    ubfx x14, x10, #26, #14
+; CHECK-BE-NEXT:    orr w11, w12, w11, lsl #7
+; CHECK-BE-NEXT:    ldrh w12, [x0, #16]
+; CHECK-BE-NEXT:    fmov s0, w15
+; CHECK-BE-NEXT:    orr w13, w14, w13, lsl #14
+; CHECK-BE-NEXT:    ubfx x14, x9, #14, #18
 ; CHECK-BE-NEXT:    add x0, x0, #32
-; CHECK-BE-NEXT:    fmov s1, w12
-; CHECK-BE-NEXT:    ubfx x12, x9, #14, #18
-; CHECK-BE-NEXT:    orr w11, w14, w11, lsl #8
-; CHECK-BE-NEXT:    mov v0.s[1], w15
-; CHECK-BE-NEXT:    mov v1.s[1], w12
-; CHECK-BE-NEXT:    extr x12, x10, x9, #40
-; CHECK-BE-NEXT:    lsl x9, x9, #24
-; CHECK-BE-NEXT:    ubfx x10, x10, #7, #25
-; CHECK-BE-NEXT:    orr w9, w11, w9
-; CHECK-BE-NEXT:    lsr w9, w9, #19
-; CHECK-BE-NEXT:    mov v0.s[2], w10
-; CHECK-BE-NEXT:    ubfx x10, x12, #12, #20
-; CHECK-BE-NEXT:    mov v1.s[2], w9
+; CHECK-BE-NEXT:    fmov s1, w11
+; CHECK-BE-NEXT:    orr w11, w16, w12, lsl #8
+; CHECK-BE-NEXT:    lsl x12, x9, #24
+; CHECK-BE-NEXT:    mov v0.s[1], w13
+; CHECK-BE-NEXT:    ubfx x13, x10, #7, #25
+; CHECK-BE-NEXT:    extr x9, x10, x9, #40
+; CHECK-BE-NEXT:    orr w12, w11, w12
+; CHECK-BE-NEXT:    mov v1.s[1], w14
+; CHECK-BE-NEXT:    lsr w12, w12, #19
+; CHECK-BE-NEXT:    ubfx x9, x9, #12, #20
+; CHECK-BE-NEXT:    mov v0.s[2], w13
+; CHECK-BE-NEXT:    mov v1.s[2], w12
+; CHECK-BE-NEXT:    mov v0.s[3], w9
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #3
 ; CHECK-BE-NEXT:    add x8, x8, #1
-; CHECK-BE-NEXT:    mov v0.s[3], w10
 ; CHECK-BE-NEXT:    cmp x8, #1000
 ; CHECK-BE-NEXT:    mov v1.s[3], w11
 ; CHECK-BE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
@@ -554,24 +554,24 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:    mov w8, #1000 ; =0x3e8
 ; CHECK-NEXT:  LBB6_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q1, q0, [x0, #32]
+; CHECK-NEXT:    ldp q4, q1, [x0, #48]
 ; CHECK-NEXT:    add x9, x1, #8
-; CHECK-NEXT:    add x10, x1, #10
-; CHECK-NEXT:    subs x8, x8, #1
 ; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    uzp1.4s v0, v1, v0
-; CHECK-NEXT:    ldr d4, [x0, #80]
-; CHECK-NEXT:    ldr q1, [x0, #64]
+; CHECK-NEXT:    subs x8, x8, #1
+; CHECK-NEXT:    ldr d0, [x0, #80]
+; CHECK-NEXT:    ldr q5, [x0, #32]
 ; CHECK-NEXT:    add x0, x0, #128
+; CHECK-NEXT:    uzp1.4s v4, v5, v4
 ; CHECK-NEXT:    uzp1.4s v2, v3, v2
-; CHECK-NEXT:    uzp1.4s v1, v1, v4
-; CHECK-NEXT:    uzp1.8h v0, v2, v0
-; CHECK-NEXT:    xtn.4h v1, v1
-; CHECK-NEXT:    uzp1.16b v0, v0, v1
-; CHECK-NEXT:    xtn.8b v1, v1
-; CHECK-NEXT:    st1.b { v1 }[2], [x10]
-; CHECK-NEXT:    str d0, [x1], #16
-; CHECK-NEXT:    st1.h { v0 }[4], [x9]
+; CHECK-NEXT:    uzp1.4s v0, v1, v0
+; CHECK-NEXT:    uzp1.8h v1, v2, v4
+; CHECK-NEXT:    xtn.4h v0, v0
+; CHECK-NEXT:    uzp1.16b v1, v1, v0
+; CHECK-NEXT:    xtn.8b v0, v0
+; CHECK-NEXT:    st1.h { v1 }[4], [x9]
+; CHECK-NEXT:    add x9, x1, #10
+; CHECK-NEXT:    st1.b { v0 }[2], [x9]
+; CHECK-NEXT:    str d1, [x1], #16
 ; CHECK-NEXT:    b.eq LBB6_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
@@ -581,32 +581,32 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:    mov w8, #1000 // =0x3e8
 ; CHECK-BE-NEXT:  .LBB6_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT:    add x9, x0, #64
+; CHECK-BE-NEXT:    add x10, x0, #16
+; CHECK-BE-NEXT:    ld1 { v3.2d }, [x0]
+; CHECK-BE-NEXT:    ld1 { v0.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x0, #48
+; CHECK-BE-NEXT:    ld1 { v1.2d }, [x10]
 ; CHECK-BE-NEXT:    add x10, x0, #32
-; CHECK-BE-NEXT:    ld1 { v0.2d }, [x0]
-; CHECK-BE-NEXT:    subs x8, x8, #1
-; CHECK-BE-NEXT:    ld1 { v1.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x0, #16
-; CHECK-BE-NEXT:    ld1 { v2.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x0, #64
-; CHECK-BE-NEXT:    ld1 { v3.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x1, #10
+; CHECK-BE-NEXT:    ld1 { v2.2d }, [x9]
+; CHECK-BE-NEXT:    ldr d5, [x0, #80]
 ; CHECK-BE-NEXT:    ld1 { v4.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x1, #8
-; CHECK-BE-NEXT:    uzp1 v1.4s, v2.4s, v1.4s
-; CHECK-BE-NEXT:    ldr d2, [x0, #80]
+; CHECK-BE-NEXT:    add x9, x1, #10
+; CHECK-BE-NEXT:    subs x8, x8, #1
+; CHECK-BE-NEXT:    uzp1 v1.4s, v3.4s, v1.4s
+; CHECK-BE-NEXT:    uzp1 v0.4s, v0.4s, v5.4s
 ; CHECK-BE-NEXT:    add x0, x0, #128
-; CHECK-BE-NEXT:    uzp1 v0.4s, v0.4s, v3.4s
 ; CHECK-BE-NEXT:    uzp1 v2.4s, v4.4s, v2.4s
-; CHECK-BE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-BE-NEXT:    xtn v1.4h, v2.4s
-; CHECK-BE-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-BE-NEXT:    xtn v1.8b, v1.8h
-; CHECK-BE-NEXT:    st1 { v1.b }[2], [x9]
-; CHECK-BE-NEXT:    rev64 v2.16b, v0.16b
-; CHECK-BE-NEXT:    rev16 v0.16b, v0.16b
-; CHECK-BE-NEXT:    str d2, [x1], #16
-; CHECK-BE-NEXT:    st1 { v0.h }[4], [x10]
+; CHECK-BE-NEXT:    xtn v0.4h, v0.4s
+; CHECK-BE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
+; CHECK-BE-NEXT:    uzp1 v1.16b, v1.16b, v0.16b
+; CHECK-BE-NEXT:    xtn v0.8b, v0.8h
+; CHECK-BE-NEXT:    rev16 v2.16b, v1.16b
+; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
+; CHECK-BE-NEXT:    st1 { v0.b }[2], [x9]
+; CHECK-BE-NEXT:    add x9, x1, #8
+; CHECK-BE-NEXT:    st1 { v2.h }[4], [x9]
+; CHECK-BE-NEXT:    str d1, [x1], #16
 ; CHECK-BE-NEXT:    b.eq .LBB6_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -654,8 +654,8 @@ define void @trunc_v16i16_to_v16i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:    add x10, x9, #16
 ; CHECK-BE-NEXT:    ld1 { v0.8h }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #4
-; CHECK-BE-NEXT:    add x8, x8, #1
 ; CHECK-BE-NEXT:    ld1 { v1.8h }, [x10]
+; CHECK-BE-NEXT:    add x8, x8, #1
 ; CHECK-BE-NEXT:    cmp x8, #1000
 ; CHECK-BE-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-BE-NEXT:    st1 { v0.16b }, [x9]

diff --git a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
index 00d1069c1d13ba..ccfbf456693d7a 100644
--- a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
+++ b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
@@ -4,12 +4,12 @@
 define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: overflow_add:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w1, w0
-; CHECK-NEXT:    mov w9, #2
-; CHECK-NEXT:    orr w8, w8, #0x1
-; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    cmp w8, #1024
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    add w9, w1, w0
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    orr w9, w9, #0x1
+; CHECK-NEXT:    and w9, w9, #0xffff
+; CHECK-NEXT:    cmp w9, #1024
+; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    csel w0, w9, w8, hi
 ; CHECK-NEXT:    ret
   %add = add i16 %b, %a
@@ -22,12 +22,12 @@ define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) {
 define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: overflow_sub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub w8, w0, w1
-; CHECK-NEXT:    mov w9, #2
-; CHECK-NEXT:    orr w8, w8, #0x1
-; CHECK-NEXT:    and w8, w8, #0xffff
-; CHECK-NEXT:    cmp w8, #1024
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    sub w9, w0, w1
+; CHECK-NEXT:    mov w8, #5 // =0x5
+; CHECK-NEXT:    orr w9, w9, #0x1
+; CHECK-NEXT:    and w9, w9, #0xffff
+; CHECK-NEXT:    cmp w9, #1024
+; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    csel w0, w9, w8, hi
 ; CHECK-NEXT:    ret
   %add = sub i16 %a, %b
@@ -41,11 +41,11 @@ define zeroext i16 @overflow_mul(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: overflow_mul:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mul w9, w1, w0
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    orr w9, w9, #0x1
 ; CHECK-NEXT:    and w9, w9, #0xffff
 ; CHECK-NEXT:    cmp w9, #1024
-; CHECK-NEXT:    mov w9, #2
+; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    csel w0, w9, w8, hi
 ; CHECK-NEXT:    ret
   %add = mul i16 %b, %a
@@ -59,11 +59,11 @@ define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) {
 ; CHECK-LABEL: overflow_shl:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    lsl w9, w0, w1
-; CHECK-NEXT:    mov w8, #5
+; CHECK-NEXT:    mov w8, #5 // =0x5
 ; CHECK-NEXT:    orr w9, w9, #0x1
 ; CHECK-NEXT:    and w9, w9, #0xffff
 ; CHECK-NEXT:    cmp w9, #1024
-; CHECK-NEXT:    mov w9, #2
+; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    csel w0, w9, w8, hi
 ; CHECK-NEXT:    ret
   %add = shl i16 %a, %b
@@ -76,10 +76,10 @@ define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) {
 define i32 @overflow_add_no_consts(i8 zeroext %a, i8 zeroext %b, i8 zeroext %limit) {
 ; CHECK-LABEL: overflow_add_no_consts:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w9, w1, w0
-; CHECK-NEXT:    mov w8, #16
-; CHECK-NEXT:    cmp w2, w9, uxtb
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    add w8, w1, w0
+; CHECK-NEXT:    mov w9, #8 // =0x8
+; CHECK-NEXT:    cmp w2, w8, uxtb
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
   %add = add i8 %b, %a
@@ -91,11 +91,11 @@ define i32 @overflow_add_no_consts(i8 zeroext %a, i8 zeroext %b, i8 zeroext %lim
 define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: overflow_add_const_limit:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w1, w0
-; CHECK-NEXT:    mov w9, #8
-; CHECK-NEXT:    and w8, w8, #0xff
-; CHECK-NEXT:    cmp w8, #128
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    add w9, w1, w0
+; CHECK-NEXT:    mov w8, #16 // =0x10
+; CHECK-NEXT:    and w9, w9, #0xff
+; CHECK-NEXT:    cmp w9, #128
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    csel w0, w9, w8, hi
 ; CHECK-NEXT:    ret
   %add = add i8 %b, %a
@@ -107,10 +107,10 @@ define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) {
 define i32 @overflow_add_positive_const_limit(i8 zeroext %a) {
 ; CHECK-LABEL: overflow_add_positive_const_limit:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    cmp w8, w0, sxtb
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    csel w0, w9, w8, gt
 ; CHECK-NEXT:    ret
   %cmp = icmp slt i8 %a, -1
@@ -121,9 +121,9 @@ define i32 @overflow_add_positive_const_limit(i8 zeroext %a) {
 define i32 @unsafe_add_underflow(i8 zeroext %a) {
 ; CHECK-LABEL: unsafe_add_underflow:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    cmp w0, #1
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    csel w0, w9, w8, eq
 ; CHECK-NEXT:    ret
   %cmp = icmp eq i8 %a, 1
@@ -134,9 +134,9 @@ define i32 @unsafe_add_underflow(i8 zeroext %a) {
 define i32 @safe_add_underflow(i8 zeroext %a) {
 ; CHECK-LABEL: safe_add_underflow:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    csel w0, w9, w8, eq
 ; CHECK-NEXT:    ret
   %cmp = icmp eq i8 %a, 0
@@ -148,9 +148,9 @@ define i32 @safe_add_underflow_neg(i8 zeroext %a) {
 ; CHECK-LABEL: safe_add_underflow_neg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub w9, w0, #2
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    cmp w9, #251
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
   %add = add i8 %a, -2
@@ -162,10 +162,10 @@ define i32 @safe_add_underflow_neg(i8 zeroext %a) {
 define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) {
 ; CHECK-LABEL: overflow_sub_negative_const_limit:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    cmp w8, w0, sxtb
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    csel w0, w9, w8, gt
 ; CHECK-NEXT:    ret
   %cmp = icmp slt i8 %a, -1
@@ -178,9 +178,9 @@ define i32 @sext_sub_underflow(i8 zeroext %a) {
 ; CHECK-LABEL: sext_sub_underflow:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub w9, w0, #6
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    cmn w9, #6
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    csel w0, w9, w8, hi
 ; CHECK-NEXT:    ret
   %sub = add i8 %a, -6
@@ -192,9 +192,9 @@ define i32 @sext_sub_underflow(i8 zeroext %a) {
 define i32 @safe_sub_underflow(i8 zeroext %a) {
 ; CHECK-LABEL: safe_sub_underflow:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8
+; CHECK-NEXT:    mov w8, #8 // =0x8
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    mov w9, #16
+; CHECK-NEXT:    mov w9, #16 // =0x10
 ; CHECK-NEXT:    csel w0, w9, w8, eq
 ; CHECK-NEXT:    ret
   %cmp.not = icmp eq i8 %a, 0
@@ -206,9 +206,9 @@ define i32 @safe_sub_underflow_neg(i8 zeroext %a) {
 ; CHECK-LABEL: safe_sub_underflow_neg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub w9, w0, #4
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    cmp w9, #250
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    csel w0, w9, w8, hi
 ; CHECK-NEXT:    ret
   %sub = add i8 %a, -4
@@ -222,9 +222,9 @@ define i32 @sext_sub_underflow_neg(i8 zeroext %a) {
 ; CHECK-LABEL: sext_sub_underflow_neg:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub w9, w0, #4
-; CHECK-NEXT:    mov w8, #16
+; CHECK-NEXT:    mov w8, #16 // =0x10
 ; CHECK-NEXT:    cmn w9, #3
-; CHECK-NEXT:    mov w9, #8
+; CHECK-NEXT:    mov w9, #8 // =0x8
 ; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
   %sub = add i8 %a, -4
@@ -262,7 +262,7 @@ entry:
 define i32 @safe_add_imm_var(ptr nocapture readnone %b) {
 ; CHECK-LABEL: safe_add_imm_var:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   ret i32 1
@@ -271,7 +271,7 @@ entry:
 define i32 @safe_add_var_imm(ptr nocapture readnone %b) {
 ; CHECK-LABEL: safe_add_var_imm:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
 entry:
   ret i32 1
@@ -281,12 +281,12 @@ define i8 @convert_add_order(i8 zeroext %arg) {
 ; CHECK-LABEL: convert_add_order:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    orr w9, w0, #0x1
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    sub w10, w9, #40
 ; CHECK-NEXT:    cmp w10, #20
 ; CHECK-NEXT:    cinc w8, w8, hs
 ; CHECK-NEXT:    cmp w9, #50
-; CHECK-NEXT:    mov w9, #255
+; CHECK-NEXT:    mov w9, #255 // =0xff
 ; CHECK-NEXT:    csel w8, w8, w9, lo
 ; CHECK-NEXT:    and w0, w8, w0
 ; CHECK-NEXT:    ret
@@ -304,12 +304,12 @@ define i8 @underflow_if_sub(i32 %arg, i8 zeroext %arg1) {
 ; CHECK-LABEL: underflow_if_sub:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    mov w9, #100
-; CHECK-NEXT:    cset w8, gt
-; CHECK-NEXT:    and w8, w8, w0
-; CHECK-NEXT:    add w8, w8, #245
-; CHECK-NEXT:    cmp w8, w1
-; CHECK-NEXT:    csel w0, w8, w9, lo
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cset w9, gt
+; CHECK-NEXT:    and w9, w9, w0
+; CHECK-NEXT:    add w9, w9, #245
+; CHECK-NEXT:    cmp w9, w1
+; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
   %cmp = icmp sgt i32 %arg, 0
   %conv = zext i1 %cmp to i32
@@ -325,12 +325,12 @@ define i8 @underflow_if_sub_signext(i32 %arg, i8 signext %arg1) {
 ; CHECK-LABEL: underflow_if_sub_signext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    mov w9, #100
-; CHECK-NEXT:    cset w8, gt
-; CHECK-NEXT:    and w8, w8, w0
-; CHECK-NEXT:    add w8, w8, #245
-; CHECK-NEXT:    cmp w8, w1, uxtb
-; CHECK-NEXT:    csel w0, w8, w9, lo
+; CHECK-NEXT:    mov w8, #100 // =0x64
+; CHECK-NEXT:    cset w9, gt
+; CHECK-NEXT:    and w9, w9, w0
+; CHECK-NEXT:    add w9, w9, #245
+; CHECK-NEXT:    cmp w9, w1, uxtb
+; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
   %cmp = icmp sgt i32 %arg, 0
   %conv = zext i1 %cmp to i32

diff --git a/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll b/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
index 64b87a589fbc34..d60578b7bafe47 100644
--- a/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
+++ b/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll
@@ -10,10 +10,10 @@ define void @phi_feeding_phi_args(i8 %a, i8 %b) {
 ; CHECK-NEXT:    csel w8, w8, w9, hi
 ; CHECK-NEXT:  .LBB0_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    sub w9, w8, #2
-; CHECK-NEXT:    lsl w10, w8, #1
+; CHECK-NEXT:    lsl w9, w8, #1
+; CHECK-NEXT:    sub w10, w8, #2
 ; CHECK-NEXT:    cmp w8, #254
-; CHECK-NEXT:    csel w8, w9, w10, lo
+; CHECK-NEXT:    csel w8, w10, w9, lo
 ; CHECK-NEXT:    cmp w8, #255
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %exit
@@ -58,10 +58,10 @@ define void @phi_feeding_phi_zeroext_args(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-NEXT:    csel w8, w0, w1, hi
 ; CHECK-NEXT:  .LBB1_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    sub w9, w8, #2
-; CHECK-NEXT:    lsl w10, w8, #1
+; CHECK-NEXT:    lsl w9, w8, #1
+; CHECK-NEXT:    sub w10, w8, #2
 ; CHECK-NEXT:    cmp w8, #254
-; CHECK-NEXT:    csel w8, w9, w10, lo
+; CHECK-NEXT:    csel w8, w10, w9, lo
 ; CHECK-NEXT:    cmp w8, #255
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %exit
@@ -103,7 +103,7 @@ define void @phi_i16() {
 ; CHECK-LABEL: phi_i16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    mov w9, #1
+; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:  .LBB2_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmp w8, #128
@@ -142,7 +142,7 @@ define i8 @ret_i8() {
 ; CHECK-LABEL: ret_i8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:  .LBB3_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmp w0, #128
@@ -181,7 +181,7 @@ define i16 @phi_multiple_undefs(i16 zeroext %arg) {
 ; CHECK-LABEL: phi_multiple_undefs:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    mov w9, #1
+; CHECK-NEXT:    mov w9, #1 // =0x1
 ; CHECK-NEXT:  .LBB4_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmp w8, #128
@@ -237,21 +237,21 @@ define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, ptr %res) {
 define i16 @signext_bitcast_phi_select(i16 signext %start, ptr %in) {
 ; CHECK-LABEL: signext_bitcast_phi_select:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w9, #-1
-; CHECK-NEXT:    cmp w9, w8, sxth
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    and w9, w0, #0xffff
+; CHECK-NEXT:    cmp w8, w9, sxth
 ; CHECK-NEXT:    b.lt .LBB6_3
 ; CHECK-NEXT:  .LBB6_1: // %if.then
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrh w0, [x1, w8, sxtw #1]
-; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    ldrh w0, [x1, w9, sxtw #1]
+; CHECK-NEXT:    cmp w0, w9
 ; CHECK-NEXT:    b.eq .LBB6_4
 ; CHECK-NEXT:  // %bb.2: // %if.else
 ; CHECK-NEXT:    // in Loop: Header=BB6_1 Depth=1
-; CHECK-NEXT:    lsr w10, w8, #15
+; CHECK-NEXT:    lsr w10, w9, #15
 ; CHECK-NEXT:    eor w10, w10, #0x1
-; CHECK-NEXT:    add w8, w10, w8
-; CHECK-NEXT:    cmp w9, w8, sxth
+; CHECK-NEXT:    add w9, w10, w9
+; CHECK-NEXT:    cmp w8, w9, sxth
 ; CHECK-NEXT:    b.ge .LBB6_1
 ; CHECK-NEXT:  .LBB6_3:
 ; CHECK-NEXT:    mov w0, wzr

diff --git a/llvm/test/CodeGen/AArch64/typepromotion-signed.ll b/llvm/test/CodeGen/AArch64/typepromotion-signed.ll
index b94825c28a5617..212f02d86850b8 100644
--- a/llvm/test/CodeGen/AArch64/typepromotion-signed.ll
+++ b/llvm/test/CodeGen/AArch64/typepromotion-signed.ll
@@ -57,11 +57,11 @@ define i32 @test_signext_b(ptr nocapture readonly %ptr, i8 signext %arg) {
 ; CHECK-LABEL: test_signext_b:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldrb w9, [x0]
-; CHECK-NEXT:    mov w8, #20894
+; CHECK-NEXT:    mov w8, #20894 // =0x519e
 ; CHECK-NEXT:    add w9, w9, w1
 ; CHECK-NEXT:    sxtb w9, w9
 ; CHECK-NEXT:    cmp w9, #0
-; CHECK-NEXT:    mov w9, #42
+; CHECK-NEXT:    mov w9, #42 // =0x2a
 ; CHECK-NEXT:    csel w0, w9, w8, ge
 ; CHECK-NEXT:    ret
 entry:
@@ -75,12 +75,12 @@ entry:
 define i32 @test_signext_b_ult_slt(ptr nocapture readonly %ptr, i8 signext %arg) {
 ; CHECK-LABEL: test_signext_b_ult_slt:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    add w9, w8, w1, uxtb
-; CHECK-NEXT:    cmp w9, #127
-; CHECK-NEXT:    mov w9, #42
-; CHECK-NEXT:    ccmp w8, #0, #0, ne
-; CHECK-NEXT:    mov w8, #57
+; CHECK-NEXT:    ldrb w9, [x0]
+; CHECK-NEXT:    mov w8, #57 // =0x39
+; CHECK-NEXT:    add w10, w9, w1, uxtb
+; CHECK-NEXT:    cmp w10, #127
+; CHECK-NEXT:    ccmp w9, #0, #0, ne
+; CHECK-NEXT:    mov w9, #42 // =0x2a
 ; CHECK-NEXT:    csel w0, w9, w8, eq
 ; CHECK-NEXT:    ret
 entry:
@@ -97,11 +97,11 @@ define i32 @test_signext_h(ptr nocapture readonly %ptr, i16 signext %arg) {
 ; CHECK-LABEL: test_signext_h:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    mov w8, #20894
+; CHECK-NEXT:    mov w8, #20894 // =0x519e
 ; CHECK-NEXT:    add w9, w9, w1
 ; CHECK-NEXT:    sxth w9, w9
 ; CHECK-NEXT:    cmp w9, #0
-; CHECK-NEXT:    mov w9, #42
+; CHECK-NEXT:    mov w9, #42 // =0x2a
 ; CHECK-NEXT:    csel w0, w9, w8, ge
 ; CHECK-NEXT:    ret
 entry:

diff --git a/llvm/test/CodeGen/AArch64/uadd_sat.ll b/llvm/test/CodeGen/AArch64/uadd_sat.ll
index 69131e27f61206..984cc8fcffbb6b 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat.ll
@@ -31,7 +31,7 @@ define i16 @func16(i16 %x, i16 %y) nounwind {
 ; CHECK-LABEL: func16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w9, #65535
+; CHECK-NEXT:    mov w9, #65535 // =0xffff
 ; CHECK-NEXT:    add w8, w8, w1, uxth
 ; CHECK-NEXT:    cmp w8, w9
 ; CHECK-NEXT:    csel w0, w8, w9, lo
@@ -43,11 +43,11 @@ define i16 @func16(i16 %x, i16 %y) nounwind {
 define i8 @func8(i8 %x, i8 %y) nounwind {
 ; CHECK-LABEL: func8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    mov w9, #255
-; CHECK-NEXT:    add w8, w8, w1, uxtb
-; CHECK-NEXT:    cmp w8, #255
-; CHECK-NEXT:    csel w0, w8, w9, lo
+; CHECK-NEXT:    and w9, w0, #0xff
+; CHECK-NEXT:    mov w8, #255 // =0xff
+; CHECK-NEXT:    add w9, w9, w1, uxtb
+; CHECK-NEXT:    cmp w9, #255
+; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
   %tmp = call i8 @llvm.uadd.sat.i8(i8 %x, i8 %y);
   ret i8 %tmp;
@@ -56,12 +56,12 @@ define i8 @func8(i8 %x, i8 %y) nounwind {
 define i4 @func3(i4 %x, i4 %y) nounwind {
 ; CHECK-LABEL: func3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, #0xf
-; CHECK-NEXT:    and w9, w0, #0xf
-; CHECK-NEXT:    add w8, w9, w8
-; CHECK-NEXT:    mov w9, #15
-; CHECK-NEXT:    cmp w8, #15
-; CHECK-NEXT:    csel w0, w8, w9, lo
+; CHECK-NEXT:    and w9, w1, #0xf
+; CHECK-NEXT:    and w10, w0, #0xf
+; CHECK-NEXT:    mov w8, #15 // =0xf
+; CHECK-NEXT:    add w9, w10, w9
+; CHECK-NEXT:    cmp w9, #15
+; CHECK-NEXT:    csel w0, w9, w8, lo
 ; CHECK-NEXT:    ret
   %tmp = call i4 @llvm.uadd.sat.i4(i4 %x, i4 %y);
   ret i4 %tmp;

diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_plus.ll b/llvm/test/CodeGen/AArch64/uadd_sat_plus.ll
index 06558b3e6cd696..705ee747f9e20b 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_plus.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_plus.ll
@@ -33,12 +33,12 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; CHECK-LABEL: func16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w9, w1, w2
-; CHECK-NEXT:    and w10, w0, #0xffff
-; CHECK-NEXT:    mov w8, #65535
-; CHECK-NEXT:    add w9, w10, w9, uxth
-; CHECK-NEXT:    cmp w9, w8
-; CHECK-NEXT:    csel w0, w9, w8, lo
+; CHECK-NEXT:    mul w8, w1, w2
+; CHECK-NEXT:    and w9, w0, #0xffff
+; CHECK-NEXT:    add w8, w9, w8, uxth
+; CHECK-NEXT:    mov w9, #65535 // =0xffff
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    csel w0, w8, w9, lo
 ; CHECK-NEXT:    ret
   %a = mul i16 %y, %z
   %tmp = call i16 @llvm.uadd.sat.i16(i16 %x, i16 %a)
@@ -48,12 +48,12 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; CHECK-LABEL: func8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w9, w1, w2
-; CHECK-NEXT:    and w10, w0, #0xff
-; CHECK-NEXT:    mov w8, #255
-; CHECK-NEXT:    add w9, w10, w9, uxtb
-; CHECK-NEXT:    cmp w9, #255
-; CHECK-NEXT:    csel w0, w9, w8, lo
+; CHECK-NEXT:    mul w8, w1, w2
+; CHECK-NEXT:    and w9, w0, #0xff
+; CHECK-NEXT:    add w8, w9, w8, uxtb
+; CHECK-NEXT:    mov w9, #255 // =0xff
+; CHECK-NEXT:    cmp w8, #255
+; CHECK-NEXT:    csel w0, w8, w9, lo
 ; CHECK-NEXT:    ret
   %a = mul i8 %y, %z
   %tmp = call i8 @llvm.uadd.sat.i8(i8 %x, i8 %a)
@@ -63,13 +63,13 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; CHECK-LABEL: func4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w9, w1, w2
-; CHECK-NEXT:    and w10, w0, #0xf
-; CHECK-NEXT:    mov w8, #15
-; CHECK-NEXT:    and w9, w9, #0xf
-; CHECK-NEXT:    add w9, w10, w9
-; CHECK-NEXT:    cmp w9, #15
-; CHECK-NEXT:    csel w0, w9, w8, lo
+; CHECK-NEXT:    mul w8, w1, w2
+; CHECK-NEXT:    and w9, w0, #0xf
+; CHECK-NEXT:    and w8, w8, #0xf
+; CHECK-NEXT:    add w8, w9, w8
+; CHECK-NEXT:    mov w9, #15 // =0xf
+; CHECK-NEXT:    cmp w8, #15
+; CHECK-NEXT:    csel w0, w8, w9, lo
 ; CHECK-NEXT:    ret
   %a = mul i4 %y, %z
   %tmp = call i4 @llvm.uadd.sat.i4(i4 %x, i4 %a)

diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index 1d0592bab6f65b..cf43adb13ebfc4 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -44,8 +44,8 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    uqadd v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
@@ -75,8 +75,8 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    uqadd v1.8h, v1.8h, v3.8h
+; CHECK-NEXT:    uqadd v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    ret
   %z = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
@@ -97,9 +97,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    uqadd v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uqadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <8 x i8>, ptr %px
@@ -112,11 +112,11 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    movi d2, #0xff00ff00ff00ff
-; CHECK-NEXT:    ldr s1, [x1]
-; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    umin v0.4h, v0.4h, v2.4h
+; CHECK-NEXT:    ldr s1, [x0]
+; CHECK-NEXT:    ldr s2, [x1]
+; CHECK-NEXT:    movi d0, #0xff00ff00ff00ff
+; CHECK-NEXT:    uaddl v1.8h, v1.8b, v2.8b
+; CHECK-NEXT:    umin v0.4h, v1.4h, v0.4h
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
@@ -130,17 +130,17 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    movi d0, #0x0000ff000000ff
-; CHECK-NEXT:    ldrb w9, [x0]
-; CHECK-NEXT:    ldrb w10, [x1, #1]
-; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x1]
+; CHECK-NEXT:    movi d2, #0x0000ff000000ff
+; CHECK-NEXT:    ldrb w10, [x0, #1]
+; CHECK-NEXT:    ldrb w11, [x1, #1]
+; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    ldrb w9, [x0, #1]
-; CHECK-NEXT:    mov v2.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w9
-; CHECK-NEXT:    add v1.2s, v1.2s, v2.2s
-; CHECK-NEXT:    umin v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    mov v0.s[1], w10
+; CHECK-NEXT:    mov v1.s[1], w11
+; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    umin v0.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strb w9, [x2]
@@ -156,9 +156,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    uqadd v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i16>, ptr %px
@@ -171,17 +171,17 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x1]
-; CHECK-NEXT:    movi d0, #0x00ffff0000ffff
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x1, #2]
-; CHECK-NEXT:    fmov s2, w8
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    ldrh w9, [x1]
+; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
+; CHECK-NEXT:    ldrh w10, [x0, #2]
+; CHECK-NEXT:    ldrh w11, [x1, #2]
+; CHECK-NEXT:    fmov s0, w8
 ; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    ldrh w9, [x0, #2]
-; CHECK-NEXT:    mov v2.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w9
-; CHECK-NEXT:    add v1.2s, v1.2s, v2.2s
-; CHECK-NEXT:    umin v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    mov v0.s[1], w10
+; CHECK-NEXT:    mov v1.s[1], w11
+; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    umin v0.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strh w9, [x2]
@@ -223,9 +223,9 @@ define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v1i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr b0, [x1]
-; CHECK-NEXT:    ldr b1, [x0]
-; CHECK-NEXT:    uqadd v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    ldr b1, [x1]
+; CHECK-NEXT:    uqadd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i8>, ptr %px
@@ -238,9 +238,9 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v1i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr h0, [x1]
-; CHECK-NEXT:    ldr h1, [x0]
-; CHECK-NEXT:    uqadd v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ldr h1, [x1]
+; CHECK-NEXT:    uqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i16>, ptr %px
@@ -293,8 +293,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    uqadd v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    uqadd v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
   %z = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -324,8 +324,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqadd v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    uqadd v1.2d, v1.2d, v3.2d
+; CHECK-NEXT:    uqadd v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z

diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index 8e79c1b03df87e..08045e814a35ef 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -4,19 +4,19 @@
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; AARCH-LABEL: muloti_test:
 ; AARCH:       // %bb.0: // %start
+; AARCH-NEXT:    mul x9, x3, x0
 ; AARCH-NEXT:    cmp x1, #0
-; AARCH-NEXT:    umulh x8, x1, x2
 ; AARCH-NEXT:    ccmp x3, #0, #4, ne
-; AARCH-NEXT:    umulh x9, x3, x0
+; AARCH-NEXT:    umulh x8, x1, x2
+; AARCH-NEXT:    umulh x10, x3, x0
+; AARCH-NEXT:    madd x9, x1, x2, x9
 ; AARCH-NEXT:    ccmp xzr, x8, #0, eq
-; AARCH-NEXT:    mul x8, x3, x0
-; AARCH-NEXT:    madd x8, x1, x2, x8
-; AARCH-NEXT:    ccmp xzr, x9, #0, eq
-; AARCH-NEXT:    umulh x9, x0, x2
+; AARCH-NEXT:    umulh x11, x0, x2
+; AARCH-NEXT:    ccmp xzr, x10, #0, eq
 ; AARCH-NEXT:    mul x0, x0, x2
-; AARCH-NEXT:    cset w10, ne
-; AARCH-NEXT:    adds x1, x9, x8
-; AARCH-NEXT:    csinc w2, w10, wzr, lo
+; AARCH-NEXT:    cset w8, ne
+; AARCH-NEXT:    adds x1, x11, x9
+; AARCH-NEXT:    csinc w2, w8, wzr, lo
 ; AARCH-NEXT:    ret
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
@@ -35,41 +35,41 @@ start:
 define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 {
 ; AARCH-LABEL: __muloti4:
 ; AARCH:       // %bb.0: // %Entry
-; AARCH-NEXT:    asr x9, x1, #63
-; AARCH-NEXT:    asr x10, x3, #63
+; AARCH-NEXT:    asr x10, x1, #63
+; AARCH-NEXT:    asr x9, x3, #63
 ; AARCH-NEXT:    umulh x14, x0, x2
 ; AARCH-NEXT:    mov x8, x1
-; AARCH-NEXT:    mul x11, x2, x9
 ; AARCH-NEXT:    str wzr, [x4]
-; AARCH-NEXT:    umulh x12, x10, x0
-; AARCH-NEXT:    umulh x13, x2, x9
-; AARCH-NEXT:    madd x12, x10, x1, x12
-; AARCH-NEXT:    add x13, x13, x11
-; AARCH-NEXT:    mul x10, x10, x0
-; AARCH-NEXT:    madd x9, x3, x9, x13
-; AARCH-NEXT:    add x12, x12, x10
-; AARCH-NEXT:    adds x10, x10, x11
-; AARCH-NEXT:    mul x11, x1, x2
-; AARCH-NEXT:    adc x9, x12, x9
+; AARCH-NEXT:    mul x12, x2, x10
+; AARCH-NEXT:    umulh x13, x2, x10
+; AARCH-NEXT:    umulh x11, x9, x0
+; AARCH-NEXT:    mul x15, x1, x2
+; AARCH-NEXT:    add x13, x13, x12
+; AARCH-NEXT:    madd x11, x9, x1, x11
+; AARCH-NEXT:    mul x9, x9, x0
+; AARCH-NEXT:    madd x10, x3, x10, x13
 ; AARCH-NEXT:    umulh x13, x1, x2
-; AARCH-NEXT:    mul x12, x0, x3
-; AARCH-NEXT:    adds x11, x11, x14
-; AARCH-NEXT:    umulh x14, x0, x3
+; AARCH-NEXT:    add x11, x11, x9
+; AARCH-NEXT:    adds x9, x9, x12
+; AARCH-NEXT:    mul x16, x0, x3
+; AARCH-NEXT:    adc x10, x11, x10
+; AARCH-NEXT:    adds x11, x15, x14
+; AARCH-NEXT:    umulh x17, x0, x3
 ; AARCH-NEXT:    cinc x13, x13, hs
-; AARCH-NEXT:    adds x1, x12, x11
-; AARCH-NEXT:    mul x12, x8, x3
-; AARCH-NEXT:    cinc x11, x14, hs
+; AARCH-NEXT:    mul x12, x1, x3
+; AARCH-NEXT:    adds x1, x16, x11
+; AARCH-NEXT:    umulh x11, x8, x3
+; AARCH-NEXT:    cinc x14, x17, hs
+; AARCH-NEXT:    adds x13, x13, x14
 ; AARCH-NEXT:    mul x0, x0, x2
-; AARCH-NEXT:    adds x11, x13, x11
-; AARCH-NEXT:    umulh x13, x8, x3
 ; AARCH-NEXT:    cset w14, hs
-; AARCH-NEXT:    adds x11, x12, x11
-; AARCH-NEXT:    adc x12, x13, x14
-; AARCH-NEXT:    adds x10, x11, x10
-; AARCH-NEXT:    asr x11, x1, #63
-; AARCH-NEXT:    adc x9, x12, x9
-; AARCH-NEXT:    cmp x10, x11
-; AARCH-NEXT:    ccmp x9, x11, #0, eq
+; AARCH-NEXT:    adds x12, x12, x13
+; AARCH-NEXT:    asr x13, x1, #63
+; AARCH-NEXT:    adc x11, x11, x14
+; AARCH-NEXT:    adds x9, x12, x9
+; AARCH-NEXT:    adc x10, x11, x10
+; AARCH-NEXT:    cmp x9, x13
+; AARCH-NEXT:    ccmp x10, x13, #0, eq
 ; AARCH-NEXT:    cset w9, ne
 ; AARCH-NEXT:    tbz x8, #63, .LBB1_2
 ; AARCH-NEXT:  // %bb.1: // %Entry
@@ -79,7 +79,7 @@ define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4
 ; AARCH-NEXT:  .LBB1_2: // %Else2
 ; AARCH-NEXT:    cbz w9, .LBB1_4
 ; AARCH-NEXT:  .LBB1_3: // %Then7
-; AARCH-NEXT:    mov w8, #1
+; AARCH-NEXT:    mov w8, #1 // =0x1
 ; AARCH-NEXT:    str w8, [x4]
 ; AARCH-NEXT:  .LBB1_4: // %Block9
 ; AARCH-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
index 5bd82475b278a9..565e014fa5736e 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -205,8 +205,8 @@ define i32 @in_multiuse_A_constmask(i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    eor w8, w0, w1
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    and w20, w8, #0xffff00
 ; CHECK-NEXT:    mov w19, w1
+; CHECK-NEXT:    and w20, w8, #0xffff00
 ; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl use32
 ; CHECK-NEXT:    eor w0, w20, w19

diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbits.ll
index f7a378412d3e37..08a6e386a2a8f3 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbits.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbits.ll
@@ -9,8 +9,8 @@
 define i8 @out8_constmask(i8 %x, i8 %y) {
 ; CHECK-LABEL: out8_constmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #85
-; CHECK-NEXT:    mov w9, #-86
+; CHECK-NEXT:    mov w8, #85 // =0x55
+; CHECK-NEXT:    mov w9, #-86 // =0xffffffaa
 ; CHECK-NEXT:    and w8, w0, w8
 ; CHECK-NEXT:    and w9, w1, w9
 ; CHECK-NEXT:    orr w0, w8, w9
@@ -24,8 +24,8 @@ define i8 @out8_constmask(i8 %x, i8 %y) {
 define i16 @out16_constmask(i16 %x, i16 %y) {
 ; CHECK-LABEL: out16_constmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #21845
-; CHECK-NEXT:    mov w9, #-21846
+; CHECK-NEXT:    mov w8, #21845 // =0x5555
+; CHECK-NEXT:    mov w9, #-21846 // =0xffffaaaa
 ; CHECK-NEXT:    and w8, w0, w8
 ; CHECK-NEXT:    and w9, w1, w9
 ; CHECK-NEXT:    orr w0, w8, w9
@@ -69,9 +69,9 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-LABEL: in8_constmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor w8, w0, w1
-; CHECK-NEXT:    mov w9, #85
-; CHECK-NEXT:    and w8, w8, w9
+; CHECK-NEXT:    mov w8, #85 // =0x55
+; CHECK-NEXT:    eor w9, w0, w1
+; CHECK-NEXT:    and w8, w9, w8
 ; CHECK-NEXT:    eor w0, w8, w1
 ; CHECK-NEXT:    ret
   %n0 = xor i8 %x, %y
@@ -83,9 +83,9 @@ define i8 @in8_constmask(i8 %x, i8 %y) {
 define i16 @in16_constmask(i16 %x, i16 %y) {
 ; CHECK-LABEL: in16_constmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor w8, w0, w1
-; CHECK-NEXT:    mov w9, #21845
-; CHECK-NEXT:    and w8, w8, w9
+; CHECK-NEXT:    mov w8, #21845 // =0x5555
+; CHECK-NEXT:    eor w9, w0, w1
+; CHECK-NEXT:    and w8, w9, w8
 ; CHECK-NEXT:    eor w0, w8, w1
 ; CHECK-NEXT:    ret
   %n0 = xor i16 %x, %y
@@ -211,8 +211,8 @@ define i32 @in_multiuse_A_constmask(i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    eor w8, w0, w1
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    and w20, w8, #0x55555555
 ; CHECK-NEXT:    mov w19, w1
+; CHECK-NEXT:    and w20, w8, #0x55555555
 ; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl use32
 ; CHECK-NEXT:    eor w0, w20, w19
@@ -251,7 +251,7 @@ define i32 @in_multiuse_B_constmask(i32 %x, i32 %y, i32 %z) nounwind {
 define i32 @n0_badconstmask(i32 %x, i32 %y) {
 ; CHECK-LABEL: n0_badconstmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #43691
+; CHECK-NEXT:    mov w8, #43691 // =0xaaab
 ; CHECK-NEXT:    and w9, w0, #0x55555555
 ; CHECK-NEXT:    movk w8, #43690, lsl #16
 ; CHECK-NEXT:    and w8, w1, w8

diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
index ae0e1d70aef718..2653508ddbf025 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
@@ -21,8 +21,8 @@ define i8 @out8_constmask(i8 %x, i8 %y) {
 define i16 @out16_constmask(i16 %x, i16 %y) {
 ; CHECK-LABEL: out16_constmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #3855
-; CHECK-NEXT:    mov w9, #-3856
+; CHECK-NEXT:    mov w8, #3855 // =0xf0f
+; CHECK-NEXT:    mov w9, #-3856 // =0xfffff0f0
 ; CHECK-NEXT:    and w8, w0, w8
 ; CHECK-NEXT:    and w9, w1, w9
 ; CHECK-NEXT:    orr w0, w8, w9
@@ -79,9 +79,9 @@ define i8 @in8_constmask(i8 %x, i8 %y) {
 define i16 @in16_constmask(i16 %x, i16 %y) {
 ; CHECK-LABEL: in16_constmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor w8, w0, w1
-; CHECK-NEXT:    mov w9, #3855
-; CHECK-NEXT:    and w8, w8, w9
+; CHECK-NEXT:    mov w8, #3855 // =0xf0f
+; CHECK-NEXT:    eor w9, w0, w1
+; CHECK-NEXT:    and w8, w9, w8
 ; CHECK-NEXT:    eor w0, w8, w1
 ; CHECK-NEXT:    ret
   %n0 = xor i16 %x, %y
@@ -207,8 +207,8 @@ define i32 @in_multiuse_A_constmask(i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    eor w8, w0, w1
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    and w20, w8, #0xf0f0f0f
 ; CHECK-NEXT:    mov w19, w1
+; CHECK-NEXT:    and w20, w8, #0xf0f0f0f
 ; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl use32
 ; CHECK-NEXT:    eor w0, w20, w19
@@ -247,7 +247,7 @@ define i32 @in_multiuse_B_constmask(i32 %x, i32 %y, i32 %z) nounwind {
 define i32 @n0_badconstmask(i32 %x, i32 %y) {
 ; CHECK-LABEL: n0_badconstmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #61681
+; CHECK-NEXT:    mov w8, #61681 // =0xf0f1
 ; CHECK-NEXT:    and w9, w0, #0xf0f0f0f
 ; CHECK-NEXT:    movk w8, #61680, lsl #16
 ; CHECK-NEXT:    and w8, w1, w8

diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-lowhigh.ll
index db3a2b2b5619a7..67b90c5f02aaeb 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-lowhigh.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-lowhigh.ll
@@ -200,8 +200,8 @@ define i32 @in_multiuse_A_constmask(i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    eor w8, w0, w1
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    and w20, w8, #0xffff
 ; CHECK-NEXT:    mov w19, w1
+; CHECK-NEXT:    and w20, w8, #0xffff
 ; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl use32
 ; CHECK-NEXT:    eor w0, w20, w19

diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
index 79f299d7595e5a..e2e560e26b5717 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
@@ -6,9 +6,9 @@
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-LABEL: out8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic w8, w1, w2
-; CHECK-NEXT:    and w9, w0, w2
-; CHECK-NEXT:    orr w0, w9, w8
+; CHECK-NEXT:    and w8, w0, w2
+; CHECK-NEXT:    bic w9, w1, w2
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %mx = and i8 %x, %mask
   %notmask = xor i8 %mask, -1
@@ -20,9 +20,9 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-LABEL: out16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic w8, w1, w2
-; CHECK-NEXT:    and w9, w0, w2
-; CHECK-NEXT:    orr w0, w9, w8
+; CHECK-NEXT:    and w8, w0, w2
+; CHECK-NEXT:    bic w9, w1, w2
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %mx = and i16 %x, %mask
   %notmask = xor i16 %mask, -1
@@ -34,9 +34,9 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: out32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic w8, w1, w2
-; CHECK-NEXT:    and w9, w0, w2
-; CHECK-NEXT:    orr w0, w9, w8
+; CHECK-NEXT:    and w8, w0, w2
+; CHECK-NEXT:    bic w9, w1, w2
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %mx = and i32 %x, %mask
   %notmask = xor i32 %mask, -1
@@ -48,9 +48,9 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 define i64 @out64(i64 %x, i64 %y, i64 %mask) {
 ; CHECK-LABEL: out64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic x8, x1, x2
-; CHECK-NEXT:    and x9, x0, x2
-; CHECK-NEXT:    orr x0, x9, x8
+; CHECK-NEXT:    and x8, x0, x2
+; CHECK-NEXT:    bic x9, x1, x2
+; CHECK-NEXT:    orr x0, x8, x9
 ; CHECK-NEXT:    ret
   %mx = and i64 %x, %mask
   %notmask = xor i64 %mask, -1
@@ -155,9 +155,9 @@ define i32 @in_commutativity_0_1_1(i32 %x, i32 %y, i32 %mask) {
 define i32 @in_commutativity_1_0_0(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: in_commutativity_1_0_0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w2
-; CHECK-NEXT:    bic w9, w0, w2
-; CHECK-NEXT:    orr w0, w8, w9
+; CHECK-NEXT:    bic w8, w0, w2
+; CHECK-NEXT:    and w9, w1, w2
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, %mask
@@ -167,9 +167,9 @@ define i32 @in_commutativity_1_0_0(i32 %x, i32 %y, i32 %mask) {
 define i32 @in_commutativity_1_0_1(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: in_commutativity_1_0_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w2
-; CHECK-NEXT:    bic w9, w0, w2
-; CHECK-NEXT:    orr w0, w8, w9
+; CHECK-NEXT:    bic w8, w0, w2
+; CHECK-NEXT:    and w9, w1, w2
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
   %n0 = xor i32 %x, %y
   %n1 = and i32 %mask, %n0 ; swapped
@@ -179,9 +179,9 @@ define i32 @in_commutativity_1_0_1(i32 %x, i32 %y, i32 %mask) {
 define i32 @in_commutativity_1_1_0(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: in_commutativity_1_1_0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w2
-; CHECK-NEXT:    bic w9, w0, w2
-; CHECK-NEXT:    orr w0, w8, w9
+; CHECK-NEXT:    bic w8, w0, w2
+; CHECK-NEXT:    and w9, w1, w2
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
   %n0 = xor i32 %x, %y
   %n1 = and i32 %n0, %mask
@@ -191,9 +191,9 @@ define i32 @in_commutativity_1_1_0(i32 %x, i32 %y, i32 %mask) {
 define i32 @in_commutativity_1_1_1(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: in_commutativity_1_1_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w2
-; CHECK-NEXT:    bic w9, w0, w2
-; CHECK-NEXT:    orr w0, w8, w9
+; CHECK-NEXT:    bic w8, w0, w2
+; CHECK-NEXT:    and w9, w1, w2
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
   %n0 = xor i32 %x, %y
   %n1 = and i32 %mask, %n0 ; swapped
@@ -268,11 +268,11 @@ define i32 @in_complex_m1(i32 %x, i32 %y, i32 %m_a, i32 %m_b) {
 define i32 @in_complex_y0_m0(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b) {
 ; CHECK-LABEL: in_complex_y0_m0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w2
-; CHECK-NEXT:    eor w9, w3, w4
-; CHECK-NEXT:    bic w8, w8, w9
-; CHECK-NEXT:    and w9, w0, w9
-; CHECK-NEXT:    orr w0, w9, w8
+; CHECK-NEXT:    eor w8, w3, w4
+; CHECK-NEXT:    and w9, w1, w2
+; CHECK-NEXT:    bic w9, w9, w8
+; CHECK-NEXT:    and w8, w0, w8
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %y = and i32 %y_hi, %y_low
   %mask = xor i32 %m_a, %m_b
@@ -284,11 +284,11 @@ define i32 @in_complex_y0_m0(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b)
 define i32 @in_complex_y1_m0(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b) {
 ; CHECK-LABEL: in_complex_y1_m0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w2
-; CHECK-NEXT:    eor w9, w3, w4
-; CHECK-NEXT:    bic w8, w8, w9
-; CHECK-NEXT:    and w9, w0, w9
-; CHECK-NEXT:    orr w0, w9, w8
+; CHECK-NEXT:    eor w8, w3, w4
+; CHECK-NEXT:    and w9, w1, w2
+; CHECK-NEXT:    bic w9, w9, w8
+; CHECK-NEXT:    and w8, w0, w8
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %y = and i32 %y_hi, %y_low
   %mask = xor i32 %m_a, %m_b
@@ -300,11 +300,11 @@ define i32 @in_complex_y1_m0(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b)
 define i32 @in_complex_y0_m1(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b) {
 ; CHECK-LABEL: in_complex_y0_m1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w2
-; CHECK-NEXT:    eor w9, w3, w4
-; CHECK-NEXT:    bic w8, w8, w9
-; CHECK-NEXT:    and w9, w0, w9
-; CHECK-NEXT:    orr w0, w9, w8
+; CHECK-NEXT:    eor w8, w3, w4
+; CHECK-NEXT:    and w9, w1, w2
+; CHECK-NEXT:    bic w9, w9, w8
+; CHECK-NEXT:    and w8, w0, w8
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %y = and i32 %y_hi, %y_low
   %mask = xor i32 %m_a, %m_b
@@ -316,11 +316,11 @@ define i32 @in_complex_y0_m1(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b)
 define i32 @in_complex_y1_m1(i32 %x, i32 %y_hi, i32 %y_low, i32 %m_a, i32 %m_b) {
 ; CHECK-LABEL: in_complex_y1_m1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, w2
-; CHECK-NEXT:    eor w9, w3, w4
-; CHECK-NEXT:    bic w8, w8, w9
-; CHECK-NEXT:    and w9, w0, w9
-; CHECK-NEXT:    orr w0, w9, w8
+; CHECK-NEXT:    eor w8, w3, w4
+; CHECK-NEXT:    and w9, w1, w2
+; CHECK-NEXT:    bic w9, w9, w8
+; CHECK-NEXT:    and w8, w0, w8
+; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %y = and i32 %y_hi, %y_low
   %mask = xor i32 %m_a, %m_b
@@ -384,7 +384,7 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: out_constant_varx_42:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    and w9, w2, w0
 ; CHECK-NEXT:    bic w8, w8, w2
 ; CHECK-NEXT:    orr w0, w9, w8
@@ -398,7 +398,7 @@ define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: in_constant_varx_42:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    and w9, w0, w2
 ; CHECK-NEXT:    bic w8, w8, w2
 ; CHECK-NEXT:    orr w0, w9, w8
@@ -412,7 +412,7 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: out_constant_varx_42_invmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    bic w9, w0, w2
 ; CHECK-NEXT:    and w8, w2, w8
 ; CHECK-NEXT:    orr w0, w9, w8
@@ -427,7 +427,7 @@ define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @in_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: in_constant_varx_42_invmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    bic w9, w0, w2
 ; CHECK-NEXT:    and w8, w2, w8
 ; CHECK-NEXT:    orr w0, w9, w8
@@ -487,7 +487,7 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: out_constant_42_vary:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    bic w9, w1, w2
 ; CHECK-NEXT:    and w8, w2, w8
 ; CHECK-NEXT:    orr w0, w8, w9
@@ -501,7 +501,7 @@ define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: in_constant_42_vary:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    bic w9, w1, w2
 ; CHECK-NEXT:    and w8, w2, w8
 ; CHECK-NEXT:    orr w0, w8, w9
@@ -515,7 +515,7 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: out_constant_42_vary_invmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    and w9, w2, w1
 ; CHECK-NEXT:    bic w8, w8, w2
 ; CHECK-NEXT:    orr w0, w8, w9
@@ -530,7 +530,7 @@ define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @in_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-LABEL: in_constant_42_vary_invmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42
+; CHECK-NEXT:    mov w8, #42 // =0x2a
 ; CHECK-NEXT:    and w9, w1, w2
 ; CHECK-NEXT:    bic w8, w8, w2
 ; CHECK-NEXT:    orr w0, w8, w9
@@ -552,8 +552,8 @@ define i32 @in_multiuse_A(i32 %x, i32 %y, i32 %z, i32 %mask) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    eor w8, w0, w1
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    and w20, w8, w3
 ; CHECK-NEXT:    mov w19, w1
+; CHECK-NEXT:    and w20, w8, w3
 ; CHECK-NEXT:    mov w0, w20
 ; CHECK-NEXT:    bl use32
 ; CHECK-NEXT:    eor w0, w20, w19
@@ -589,9 +589,9 @@ define i32 @in_multiuse_B(i32 %x, i32 %y, i32 %z, i32 %mask) nounwind {
 define i32 @n0_badmask(i32 %x, i32 %y, i32 %mask, i32 %mask2) {
 ; CHECK-LABEL: n0_badmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, w2
-; CHECK-NEXT:    bic w9, w1, w3
-; CHECK-NEXT:    orr w0, w8, w9
+; CHECK-NEXT:    bic w8, w1, w3
+; CHECK-NEXT:    and w9, w0, w2
+; CHECK-NEXT:    orr w0, w9, w8
 ; CHECK-NEXT:    ret
   %mx = and i32 %x, %mask
   %notmask = xor i32 %mask2, -1 ; %mask2 instead of %mask

diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
index 607f5dd3dc772f..d015ce956f0fc1 100644
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
@@ -31,8 +31,8 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d3, #0x0000ff000000ff
 ; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    eor v3.8b, v2.8b, v3.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v3.8b
 ; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %mx = and <2 x i8> %x, %mask
@@ -63,8 +63,8 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d3, #0xff00ff00ff00ff
 ; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    eor v3.8b, v2.8b, v3.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v3.8b
 ; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %mx = and <4 x i8> %x, %mask
@@ -79,8 +79,8 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d3, #0xff00ff00ff00ff
 ; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    eor v3.8b, v2.8b, v3.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v3.8b
 ; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %mx = and <4 x i8> %x, %mask
@@ -95,8 +95,8 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi d3, #0x00ffff0000ffff
 ; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    eor v3.8b, v2.8b, v3.8b
+; CHECK-NEXT:    and v1.8b, v1.8b, v3.8b
 ; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %mx = and <2 x i16> %x, %mask

diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll
index 2eb46b97603a54..2212e0a6334143 100644
--- a/llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -4,13 +4,13 @@
 define i32 @fold_urem_positive_odd(i32 %x) {
 ; CHECK-LABEL: fold_urem_positive_odd:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8969
+; CHECK-NEXT:    mov w8, #8969 // =0x2309
 ; CHECK-NEXT:    movk w8, #22765, lsl #16
 ; CHECK-NEXT:    umull x8, w0, w8
 ; CHECK-NEXT:    lsr x8, x8, #32
 ; CHECK-NEXT:    sub w9, w0, w8
 ; CHECK-NEXT:    add w8, w8, w9, lsr #1
-; CHECK-NEXT:    mov w9, #95
+; CHECK-NEXT:    mov w9, #95 // =0x5f
 ; CHECK-NEXT:    lsr w8, w8, #6
 ; CHECK-NEXT:    msub w0, w8, w9, w0
 ; CHECK-NEXT:    ret
@@ -22,8 +22,8 @@ define i32 @fold_urem_positive_odd(i32 %x) {
 define i32 @fold_urem_positive_even(i32 %x) {
 ; CHECK-LABEL: fold_urem_positive_even:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #16323
-; CHECK-NEXT:    mov w9, #1060
+; CHECK-NEXT:    mov w8, #16323 // =0x3fc3
+; CHECK-NEXT:    mov w9, #1060 // =0x424
 ; CHECK-NEXT:    movk w8, #63310, lsl #16
 ; CHECK-NEXT:    umull x8, w0, w8
 ; CHECK-NEXT:    lsr x8, x8, #42
@@ -38,13 +38,13 @@ define i32 @fold_urem_positive_even(i32 %x) {
 define i32 @combine_urem_udiv(i32 %x) {
 ; CHECK-LABEL: combine_urem_udiv:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #8969
+; CHECK-NEXT:    mov w8, #8969 // =0x2309
 ; CHECK-NEXT:    movk w8, #22765, lsl #16
 ; CHECK-NEXT:    umull x8, w0, w8
 ; CHECK-NEXT:    lsr x8, x8, #32
 ; CHECK-NEXT:    sub w9, w0, w8
 ; CHECK-NEXT:    add w8, w8, w9, lsr #1
-; CHECK-NEXT:    mov w9, #95
+; CHECK-NEXT:    mov w9, #95 // =0x5f
 ; CHECK-NEXT:    lsr w8, w8, #6
 ; CHECK-NEXT:    msub w9, w8, w9, w0
 ; CHECK-NEXT:    add w0, w9, w8
@@ -88,13 +88,13 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) {
 define i64 @dont_fold_urem_i64(i64 %x) {
 ; CHECK-LABEL: dont_fold_urem_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #58849
-; CHECK-NEXT:    lsr x9, x0, #1
-; CHECK-NEXT:    movk x8, #48148, lsl #16
-; CHECK-NEXT:    movk x8, #33436, lsl #32
-; CHECK-NEXT:    movk x8, #21399, lsl #48
-; CHECK-NEXT:    umulh x8, x9, x8
-; CHECK-NEXT:    mov w9, #98
+; CHECK-NEXT:    mov x9, #58849 // =0xe5e1
+; CHECK-NEXT:    lsr x8, x0, #1
+; CHECK-NEXT:    movk x9, #48148, lsl #16
+; CHECK-NEXT:    movk x9, #33436, lsl #32
+; CHECK-NEXT:    movk x9, #21399, lsl #48
+; CHECK-NEXT:    umulh x8, x8, x9
+; CHECK-NEXT:    mov w9, #98 // =0x62
 ; CHECK-NEXT:    lsr x8, x8, #4
 ; CHECK-NEXT:    msub x0, x8, x9, x0
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
index 029bf8361f18d1..c4f6e7d7528d88 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll
@@ -67,25 +67,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
-; CHECK-NEXT:    adrp x9, .LCPI4_1
-; CHECK-NEXT:    mov v0.h[1], w1
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT:    ldr d2, [x9, :lo12:.LCPI4_1]
-; CHECK-NEXT:    adrp x8, .LCPI4_2
+; CHECK-NEXT:    adrp x8, .LCPI4_1
+; CHECK-NEXT:    mov v0.h[1], w1
 ; CHECK-NEXT:    mov v0.h[2], w2
 ; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    movi d1, #0x0000000000ffff
-; CHECK-NEXT:    mul v0.4h, v0.4h, v2.4h
-; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_2]
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI4_1]
+; CHECK-NEXT:    adrp x8, .LCPI4_2
+; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI4_2]
 ; CHECK-NEXT:    adrp x8, .LCPI4_3
-; CHECK-NEXT:    add v3.4h, v0.4h, v0.4h
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    movi d1, #0x0000000000ffff
+; CHECK-NEXT:    add v2.4h, v0.4h, v0.4h
 ; CHECK-NEXT:    bic v0.4h, #248, lsl #8
+; CHECK-NEXT:    ushl v2.4h, v2.4h, v3.4h
 ; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ushl v1.4h, v3.4h, v2.4h
-; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT:    orr v0.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    bic v0.4h, #248, lsl #8
-; CHECK-NEXT:    cmhi v0.4h, v0.4h, v2.4h
+; CHECK-NEXT:    cmhi v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    umov w0, v0.h[0]
 ; CHECK-NEXT:    umov w1, v0.h[1]
 ; CHECK-NEXT:    umov w2, v0.h[2]

diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
index 5dcb9edfd1f34c..8b72a324a9ab28 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
@@ -7,7 +7,6 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    adrp x9, .LCPI0_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT:    adrp x8, .LCPI0_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI0_2]
@@ -16,10 +15,11 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI0_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 25, i32 100>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -71,7 +71,6 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI3_0
 ; CHECK-NEXT:    adrp x9, .LCPI3_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    adrp x8, .LCPI3_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI3_2]
@@ -80,10 +79,11 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI3_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -95,7 +95,6 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    adrp x9, .LCPI4_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    adrp x8, .LCPI4_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI4_2]
@@ -104,10 +103,11 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI4_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 4294967295, i32 14>
   %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -121,7 +121,6 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI5_0
 ; CHECK-NEXT:    adrp x9, .LCPI5_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI5_0]
 ; CHECK-NEXT:    adrp x8, .LCPI5_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI5_2]
@@ -130,10 +129,11 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI5_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI5_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -145,7 +145,6 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI6_0
 ; CHECK-NEXT:    adrp x9, .LCPI6_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI6_0]
 ; CHECK-NEXT:    adrp x8, .LCPI6_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI6_2]
@@ -154,10 +153,11 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI6_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI6_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI6_3]
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 4294967295, i32 100>
   %cmp = icmp ne <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -173,7 +173,6 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI7_0
 ; CHECK-NEXT:    adrp x9, .LCPI7_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI7_0]
 ; CHECK-NEXT:    adrp x8, .LCPI7_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI7_2]
@@ -182,10 +181,11 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI7_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 16, i32 5>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -199,7 +199,6 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI8_0
 ; CHECK-NEXT:    adrp x9, .LCPI8_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI8_0]
 ; CHECK-NEXT:    adrp x8, .LCPI8_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI8_2]
@@ -208,10 +207,11 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI8_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI8_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI8_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 16, i32 14>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -225,7 +225,6 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI9_0
 ; CHECK-NEXT:    adrp x9, .LCPI9_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI9_0]
 ; CHECK-NEXT:    adrp x8, .LCPI9_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI9_2]
@@ -234,10 +233,11 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI9_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI9_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI9_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 16, i32 100>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -252,8 +252,8 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_odd_one:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #52429 // =0xcccd
-; CHECK-NEXT:    movk w8, #52428, lsl #16
 ; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    movk w8, #52428, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    adrp x8, .LCPI10_0
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
@@ -272,8 +272,8 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_even_one:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #28087 // =0x6db7
-; CHECK-NEXT:    movk w8, #46811, lsl #16
 ; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    movk w8, #46811, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    adrp x8, .LCPI11_0
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
@@ -295,7 +295,6 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI12_0
 ; CHECK-NEXT:    adrp x9, .LCPI12_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI12_0]
 ; CHECK-NEXT:    adrp x8, .LCPI12_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI12_2]
@@ -304,10 +303,11 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI12_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 1, i32 100>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -323,7 +323,6 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI13_0
 ; CHECK-NEXT:    adrp x9, .LCPI13_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI13_0]
 ; CHECK-NEXT:    adrp x8, .LCPI13_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI13_2]
@@ -332,10 +331,11 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI13_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI13_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI13_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 2147483648, i32 5>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -349,7 +349,6 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-NEXT:    adrp x9, .LCPI14_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
 ; CHECK-NEXT:    adrp x8, .LCPI14_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI14_2]
@@ -358,10 +357,11 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI14_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI14_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI14_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -375,7 +375,6 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NEXT:    adrp x9, .LCPI15_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_0]
 ; CHECK-NEXT:    adrp x8, .LCPI15_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI15_2]
@@ -384,10 +383,11 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI15_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI15_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI15_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 14, i32 2147483648, i32 100>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -403,7 +403,6 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-NEXT:    adrp x9, .LCPI16_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
 ; CHECK-NEXT:    adrp x8, .LCPI16_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI16_2]
@@ -412,10 +411,11 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI16_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 5>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -429,7 +429,6 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-NEXT:    adrp x9, .LCPI17_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
 ; CHECK-NEXT:    adrp x8, .LCPI17_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI17_2]
@@ -438,10 +437,11 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI17_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI17_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 14>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -455,7 +455,6 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI18_0
 ; CHECK-NEXT:    adrp x9, .LCPI18_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_0]
 ; CHECK-NEXT:    adrp x8, .LCPI18_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI18_2]
@@ -464,10 +463,11 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi
 ; CHECK-NEXT:    adrp x8, .LCPI18_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 100>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -502,7 +502,6 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI20_0
 ; CHECK-NEXT:    adrp x9, .LCPI20_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI20_0]
 ; CHECK-NEXT:    adrp x8, .LCPI20_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI20_2]
@@ -511,10 +510,11 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI20_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI20_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI20_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 1, i32 14>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -528,7 +528,6 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI21_0
 ; CHECK-NEXT:    adrp x9, .LCPI21_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI21_0]
 ; CHECK-NEXT:    adrp x8, .LCPI21_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI21_2]
@@ -537,10 +536,11 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI21_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 1, i32 100>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -556,7 +556,6 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI22_0
 ; CHECK-NEXT:    adrp x9, .LCPI22_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI22_0]
 ; CHECK-NEXT:    adrp x8, .LCPI22_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI22_2]
@@ -565,10 +564,11 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI22_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -582,7 +582,6 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI23_0
 ; CHECK-NEXT:    adrp x9, .LCPI23_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI23_0]
 ; CHECK-NEXT:    adrp x8, .LCPI23_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI23_2]
@@ -591,10 +590,11 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI23_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI23_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI23_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 14, i32 16, i32 1, i32 14>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -608,7 +608,6 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI24_0
 ; CHECK-NEXT:    adrp x9, .LCPI24_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI24_0]
 ; CHECK-NEXT:    adrp x8, .LCPI24_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI24_2]
@@ -617,10 +616,11 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI24_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 16, i32 1, i32 100>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -635,7 +635,6 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI25_0
 ; CHECK-NEXT:    adrp x9, .LCPI25_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI25_0]
 ; CHECK-NEXT:    adrp x8, .LCPI25_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI25_2]
@@ -644,10 +643,11 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou
 ; CHECK-NEXT:    adrp x8, .LCPI25_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI25_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI25_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 5, i32 4294967295, i32 16, i32 1>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -660,7 +660,6 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI26_0
 ; CHECK-NEXT:    adrp x9, .LCPI26_2
-; CHECK-NEXT:    movi v3.4s, #1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI26_0]
 ; CHECK-NEXT:    adrp x8, .LCPI26_1
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI26_2]
@@ -669,10 +668,11 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no
 ; CHECK-NEXT:    adrp x8, .LCPI26_3
 ; CHECK-NEXT:    ushl v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ushl v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI26_3]
+; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhs v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI26_3]
+; CHECK-NEXT:    cmhs v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>

diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
index d66e81939a8b8a..ab67be9445ed3a 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -6,8 +6,8 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_odd_25:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #23593 // =0x5c29
-; CHECK-NEXT:    movk w8, #49807, lsl #16
 ; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    movk w8, #49807, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    mov w8, #28835 // =0x70a3
 ; CHECK-NEXT:    movk w8, #2621, lsl #16
@@ -27,8 +27,8 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_even_100:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #23593 // =0x5c29
-; CHECK-NEXT:    movk w8, #49807, lsl #16
 ; CHECK-NEXT:    movi v2.4s, #1
+; CHECK-NEXT:    movk w8, #49807, lsl #16
 ; CHECK-NEXT:    dup v1.4s, w8
 ; CHECK-NEXT:    mov w8, #23592 // =0x5c28
 ; CHECK-NEXT:    movk w8, #655, lsl #16
@@ -165,10 +165,10 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_pow2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #15
-; CHECK-NEXT:    movi v2.4s, #1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
   %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
@@ -180,8 +180,8 @@ define <4 x i32> @test_urem_pow2(<4 x i32> %X) nounwind {
 define <4 x i32> @test_urem_int_min(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_int_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    bic v0.4s, #128, lsl #24
+; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll
index a0936ae6fafee9..50d6b9fc1026cc 100644
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll
@@ -5,11 +5,11 @@ define <4 x i1> @t0_all_tautological(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: t0_all_tautological:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
-; CHECK-NEXT:    adrp x9, .LCPI0_1
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI0_1]
+; CHECK-NEXT:    adrp x8, .LCPI0_1
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT:    cmeq v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 2, i32 2>
@@ -76,16 +76,16 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind {
 define <2 x i1> @t3_wide(<2 x i64> %X) nounwind {
 ; CHECK-LABEL: t3_wide:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x10, d0
 ; CHECK-NEXT:    mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa
-; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    mov x9, v0.d[1]
 ; CHECK-NEXT:    movk x8, #43691
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    mul x9, x9, x8
-; CHECK-NEXT:    mul x8, x10, x8
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    adrp x9, .LCPI4_0
+; CHECK-NEXT:    mul x10, x10, x8
+; CHECK-NEXT:    mul x8, x9, x8
+; CHECK-NEXT:    fmov d0, x10
 ; CHECK-NEXT:    mov v0.d[1], x8
-; CHECK-NEXT:    ldr q1, [x9, :lo12:.LCPI4_0]
+; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    cmhs v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT:    movi d1, #0xffffffff00000000
 ; CHECK-NEXT:    xtn v0.2s, v0.2d

diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
index 86eb825e14fb49..dc021bc3bfcc74 100644
--- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
@@ -5,23 +5,23 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; CHECK-LABEL: fold_urem_vec_1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
-; CHECK-NEXT:    adrp x9, .LCPI0_1
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    adrp x8, .LCPI0_1
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI0_1]
 ; CHECK-NEXT:    adrp x8, .LCPI0_2
-; CHECK-NEXT:    ldr d2, [x9, :lo12:.LCPI0_1]
-; CHECK-NEXT:    adrp x9, .LCPI0_4
 ; CHECK-NEXT:    ushl v1.4h, v0.4h, v1.4h
-; CHECK-NEXT:    umull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI0_2]
+; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI0_2]
 ; CHECK-NEXT:    adrp x8, .LCPI0_3
+; CHECK-NEXT:    umull v1.4s, v1.4h, v2.4h
 ; CHECK-NEXT:    shrn v1.4h, v1.4s, #16
-; CHECK-NEXT:    sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    umull v2.4s, v3.4h, v2.4h
-; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-NEXT:    umull v2.4s, v2.4h, v3.4h
 ; CHECK-NEXT:    shrn v2.4h, v2.4s, #16
 ; CHECK-NEXT:    add v1.4h, v2.4h, v1.4h
-; CHECK-NEXT:    ldr d2, [x9, :lo12:.LCPI0_4]
-; CHECK-NEXT:    ushl v1.4h, v1.4h, v3.4h
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT:    adrp x8, .LCPI0_4
+; CHECK-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI0_4]
 ; CHECK-NEXT:    mls v0.4h, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
@@ -69,15 +69,15 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; CHECK-LABEL: dont_fold_urem_power_of_two:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    adrp x9, .LCPI3_2
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    adrp x8, .LCPI3_1
-; CHECK-NEXT:    ldr d3, [x9, :lo12:.LCPI3_2]
-; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT:    adrp x8, .LCPI3_2
+; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    shrn v1.4h, v1.4s, #16
 ; CHECK-NEXT:    ushl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT:    mls v0.4h, v1.4h, v3.4h
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI3_2]
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
   ret <4 x i16> %1
@@ -88,26 +88,26 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; CHECK-LABEL: dont_fold_urem_one:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
+; CHECK-NEXT:    movi d4, #0x0000000000ffff
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    adrp x8, .LCPI4_1
-; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
-; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_1]
+; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI4_1]
 ; CHECK-NEXT:    adrp x8, .LCPI4_2
+; CHECK-NEXT:    umull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    shrn v1.4h, v1.4s, #16
-; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT:    adrp x8, .LCPI4_3
-; CHECK-NEXT:    sub v3.4h, v0.4h, v1.4h
-; CHECK-NEXT:    umull v2.4s, v3.4h, v2.4h
+; CHECK-NEXT:    sub v2.4h, v0.4h, v1.4h
+; CHECK-NEXT:    umull v2.4s, v2.4h, v3.4h
 ; CHECK-NEXT:    movi d3, #0xffffffffffff0000
 ; CHECK-NEXT:    shrn v2.4h, v2.4s, #16
 ; CHECK-NEXT:    add v1.4h, v2.4h, v1.4h
-; CHECK-NEXT:    movi d2, #0x0000000000ffff
-; CHECK-NEXT:    ushl v1.4h, v1.4h, v4.4h
-; CHECK-NEXT:    ldr d4, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_2]
+; CHECK-NEXT:    adrp x8, .LCPI4_3
+; CHECK-NEXT:    ushl v1.4h, v1.4h, v2.4h
+; CHECK-NEXT:    and v2.8b, v0.8b, v4.8b
 ; CHECK-NEXT:    and v1.8b, v1.8b, v3.8b
-; CHECK-NEXT:    and v2.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    orr v1.8b, v2.8b, v1.8b
-; CHECK-NEXT:    mls v0.4h, v1.4h, v4.4h
+; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1
@@ -128,35 +128,35 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x8, #17097 // =0x42c9
 ; CHECK-NEXT:    fmov x9, d1
+; CHECK-NEXT:    mov x10, v1.d[1]
 ; CHECK-NEXT:    movk x8, #45590, lsl #16
-; CHECK-NEXT:    mov x13, #21445 // =0x53c5
-; CHECK-NEXT:    movk x8, #34192, lsl #32
-; CHECK-NEXT:    movk x13, #1603, lsl #16
-; CHECK-NEXT:    movk x8, #25644, lsl #48
-; CHECK-NEXT:    movk x13, #15432, lsl #32
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    movk x13, #25653, lsl #48
-; CHECK-NEXT:    umulh x8, x9, x8
-; CHECK-NEXT:    mov x11, v1.d[1]
-; CHECK-NEXT:    sub x12, x9, x8
-; CHECK-NEXT:    lsr x14, x10, #1
-; CHECK-NEXT:    add x8, x8, x12, lsr #1
+; CHECK-NEXT:    mov x11, v0.d[1]
 ; CHECK-NEXT:    mov x12, #12109 // =0x2f4d
+; CHECK-NEXT:    movk x8, #34192, lsl #32
 ; CHECK-NEXT:    movk x12, #52170, lsl #16
-; CHECK-NEXT:    umulh x13, x14, x13
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movk x8, #25644, lsl #48
 ; CHECK-NEXT:    movk x12, #28749, lsl #32
-; CHECK-NEXT:    mov w14, #23 // =0x17
+; CHECK-NEXT:    umulh x8, x9, x8
 ; CHECK-NEXT:    movk x12, #49499, lsl #48
+; CHECK-NEXT:    lsr x13, x11, #1
+; CHECK-NEXT:    umulh x12, x10, x12
+; CHECK-NEXT:    sub x14, x9, x8
+; CHECK-NEXT:    add x8, x8, x14, lsr #1
+; CHECK-NEXT:    mov x14, #21445 // =0x53c5
+; CHECK-NEXT:    movk x14, #1603, lsl #16
+; CHECK-NEXT:    movk x14, #15432, lsl #32
 ; CHECK-NEXT:    lsr x8, x8, #4
-; CHECK-NEXT:    lsr x13, x13, #7
-; CHECK-NEXT:    umulh x12, x11, x12
+; CHECK-NEXT:    movk x14, #25653, lsl #48
+; CHECK-NEXT:    umulh x13, x13, x14
+; CHECK-NEXT:    mov w14, #23 // =0x17
 ; CHECK-NEXT:    msub x8, x8, x14, x9
-; CHECK-NEXT:    mov w9, #5423 // =0x152f
-; CHECK-NEXT:    lsr x12, x12, #12
-; CHECK-NEXT:    mov w14, #654 // =0x28e
-; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    msub x9, x12, x9, x11
-; CHECK-NEXT:    msub x10, x13, x14, x10
+; CHECK-NEXT:    lsr x9, x12, #12
+; CHECK-NEXT:    mov w12, #5423 // =0x152f
+; CHECK-NEXT:    msub x9, x9, x12, x10
+; CHECK-NEXT:    mov w12, #654 // =0x28e
+; CHECK-NEXT:    lsr x10, x13, #7
+; CHECK-NEXT:    msub x10, x10, x12, x11
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    mov v1.d[1], x9
 ; CHECK-NEXT:    mov v0.d[1], x10
@@ -261,18 +261,18 @@ define <2 x i32> @fold_urem_v2i32(<2 x i32> %x) {
 define <2 x i64> @fold_urem_v2i64(<2 x i64> %x) {
 ; CHECK-LABEL: fold_urem_v2i64:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x10, d0
 ; CHECK-NEXT:    mov x8, #-3689348814741910324 // =0xcccccccccccccccc
-; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    mov x9, v0.d[1]
 ; CHECK-NEXT:    movk x8, #52429
 ; CHECK-NEXT:    mov w12, #10 // =0xa
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    umulh x11, x9, x8
+; CHECK-NEXT:    umulh x11, x10, x8
+; CHECK-NEXT:    umulh x8, x9, x8
 ; CHECK-NEXT:    lsr x11, x11, #3
-; CHECK-NEXT:    umulh x8, x10, x8
-; CHECK-NEXT:    msub x9, x11, x12, x9
+; CHECK-NEXT:    msub x10, x11, x12, x10
 ; CHECK-NEXT:    lsr x8, x8, #3
-; CHECK-NEXT:    msub x8, x8, x12, x10
-; CHECK-NEXT:    fmov d0, x9
+; CHECK-NEXT:    msub x8, x8, x12, x9
+; CHECK-NEXT:    fmov d0, x10
 ; CHECK-NEXT:    mov v0.d[1], x8
 ; CHECK-NEXT:    ret
   %1 = urem <2 x i64> %x, <i64 10, i64 10>
@@ -283,10 +283,10 @@ define <1 x i64> @fold_urem_v1i64(<1 x i64> %x) {
 ; CHECK-LABEL: fold_urem_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov x8, #-3689348814741910324 // =0xcccccccccccccccc
 ; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    movk x8, #52429
+; CHECK-NEXT:    mov x8, #-3689348814741910324 // =0xcccccccccccccccc
 ; CHECK-NEXT:    mov w10, #10 // =0xa
+; CHECK-NEXT:    movk x8, #52429
 ; CHECK-NEXT:    umulh x8, x9, x8
 ; CHECK-NEXT:    lsr x8, x8, #3
 ; CHECK-NEXT:    msub x8, x8, x10, x9

diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 039e11654bab7c..6c8ee89b50bffa 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -45,8 +45,8 @@ define <16 x i8> @v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 define <32 x i8> @v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    uqsub v1.16b, v1.16b, v3.16b
+; CHECK-NEXT:    uqsub v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %x, <32 x i8> %y)
   ret <32 x i8> %z
@@ -76,8 +76,8 @@ define <8 x i16> @v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 define <16 x i16> @v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
 ; CHECK-LABEL: v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    uqsub v1.8h, v1.8h, v3.8h
+; CHECK-NEXT:    uqsub v0.8h, v0.8h, v2.8h
 ; CHECK-NEXT:    ret
   %z = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %x, <16 x i16> %y)
   ret <16 x i16> %z
@@ -98,9 +98,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    uqsub v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uqsub v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <8 x i8>, ptr %px
@@ -131,14 +131,14 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x1]
-; CHECK-NEXT:    ldrb w9, [x0]
-; CHECK-NEXT:    ldrb w10, [x1, #1]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    ldrb w9, [x0, #1]
-; CHECK-NEXT:    mov v1.s[1], w10
-; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    ldrb w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x1]
+; CHECK-NEXT:    ldrb w10, [x0, #1]
+; CHECK-NEXT:    ldrb w11, [x1, #1]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    mov v0.s[1], w10
+; CHECK-NEXT:    mov v1.s[1], w11
 ; CHECK-NEXT:    uqsub v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
@@ -155,9 +155,9 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    uqsub v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    uqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str d0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i16>, ptr %px
@@ -170,14 +170,14 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x1]
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x1, #2]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    ldrh w9, [x0, #2]
-; CHECK-NEXT:    mov v1.s[1], w10
-; CHECK-NEXT:    mov v0.s[1], w9
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    ldrh w9, [x1]
+; CHECK-NEXT:    ldrh w10, [x0, #2]
+; CHECK-NEXT:    ldrh w11, [x1, #2]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    mov v0.s[1], w10
+; CHECK-NEXT:    mov v1.s[1], w11
 ; CHECK-NEXT:    uqsub v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
@@ -220,9 +220,9 @@ define void @v12i16(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v1i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr b0, [x1]
-; CHECK-NEXT:    ldr b1, [x0]
-; CHECK-NEXT:    uqsub v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    ldr b1, [x1]
+; CHECK-NEXT:    uqsub v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    st1 { v0.b }[0], [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i8>, ptr %px
@@ -235,9 +235,9 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind {
 define void @v1i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-LABEL: v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr h0, [x1]
-; CHECK-NEXT:    ldr h1, [x0]
-; CHECK-NEXT:    uqsub v0.4h, v1.4h, v0.4h
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    ldr h1, [x1]
+; CHECK-NEXT:    uqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    str h0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <1 x i16>, ptr %px
@@ -291,8 +291,8 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    uqsub v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    uqsub v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
   %z = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %x, <8 x i32> %y)
   ret <8 x i32> %z
@@ -322,8 +322,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
 ; CHECK-LABEL: v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uqsub v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    uqsub v1.2d, v1.2d, v3.2d
+; CHECK-NEXT:    uqsub v0.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    ret
   %z = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> %x, <4 x i64> %y)
   ret <4 x i64> %z

diff --git a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
index 53d166b4c65694..380bdbcc7f7408 100644
--- a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
+++ b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -4,14 +4,14 @@
 define <8 x i8> @float_to_i8(ptr %in) {
 ; CHECK-LABEL: float_to_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    ldp q1, q0, [x0]
 ; CHECK-NEXT:    fadd v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    ret
   %l = load <8 x float>, ptr %in
   %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>

diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index dcef874d7cc3d5..1b22e2f900ddb7 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
 
 ; Basic tests from input vector to bitmask
@@ -6,38 +7,22 @@
 
 define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 ; Bits used in mask
-; CHECK-LABEL: lCPI0_0
-; CHECK-NEXT: .byte	1
-; CHECK-NEXT: .byte	2
-; CHECK-NEXT: .byte	4
-; CHECK-NEXT: .byte	8
-; CHECK-NEXT: .byte	16
-; CHECK-NEXT: .byte	32
-; CHECK-NEXT: .byte	64
-; CHECK-NEXT: .byte	128
-; CHECK-NEXT: .byte	1
-; CHECK-NEXT: .byte	2
-; CHECK-NEXT: .byte	4
-; CHECK-NEXT: .byte	8
-; CHECK-NEXT: .byte	16
-; CHECK-NEXT: .byte	32
-; CHECK-NEXT: .byte	64
-; CHECK-NEXT: .byte	128
+; CHECK-LABEL: convert_to_bitmask16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
+; CHECK-NEXT:    cmeq.16b v0, v0, #0
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    ext.16b v1, v0, v0, #8
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
 
 ; Actual conversion
-; CHECK-LABEL: convert_to_bitmask16
-; CHECK:       ; %bb.0:
-; CHECK-NEXT: Lloh0:
-; CHECK-NEXT:  adrp	    x8, lCPI0_0@PAGE
-; CHECK-NEXT:  cmeq.16b v0, v0, #0
-; CHECK-NEXT: Lloh1:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI0_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  ext.16b	v1, v0, v0, #8
-; CHECK-NEXT:  zip1.16b	v0, v0, v1
-; CHECK-NEXT:  addv.8h	h0, v0
-; CHECK-NEXT:  fmov	    w0, s0
-; CHECK-NEXT:  ret
 
   %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
   %bitmask = bitcast <16 x i1> %cmp_result to i16
@@ -45,28 +30,20 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) {
 }
 
 define i16 @convert_to_bitmask8(<8 x i16> %vec) {
-; CHECK-LABEL: lCPI1_0:
-; CHECK-NEXT: .short	1
-; CHECK-NEXT: .short	2
-; CHECK-NEXT: .short	4
-; CHECK-NEXT: .short	8
-; CHECK-NEXT: .short	16
-; CHECK-NEXT: .short	32
-; CHECK-NEXT: .short	64
-; CHECK-NEXT: .short	128
-
-; CHECK-LABEL: convert_to_bitmask8
+; CHECK-LABEL: convert_to_bitmask8:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT: Lloh2:
-; CHECK-NEXT:  adrp	    x8, lCPI1_0@PAGE
-; CHECK-NEXT:  cmeq.8h	v0, v0, #0
-; CHECK-NEXT: Lloh3:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI1_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  addv.8h	h0, v0
-; CHECK-NEXT:  fmov	    w8, s0
-; CHECK-NEXT:  and	    w0, w8, #0xff
-; CHECK-NEXT:  ret
+; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:    adrp x8, lCPI1_0@PAGE
+; CHECK-NEXT:    cmeq.8h v0, v0, #0
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    ldr q1, [x8, lCPI1_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w0, w8, #0xff
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
+
 
   %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
   %bitmask = bitcast <8 x i1> %cmp_result to i8
@@ -75,23 +52,19 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) {
 }
 
 define i4 @convert_to_bitmask4(<4 x i32> %vec) {
-; CHECK-LABEL: lCPI2_0:
-; CHECK-NEXT:  .long	1
-; CHECK-NEXT:  .long	2
-; CHECK-NEXT:  .long	4
-; CHECK-NEXT:  .long	8
-
-; CHECK-LABEL: convert_to_bitmask4
+; CHECK-LABEL: convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT: Lloh4:
-; CHECK-NEXT:  adrp	    x8, lCPI2_0@PAGE
-; CHECK-NEXT:  cmeq.4s	v0, v0, #0
-; CHECK-NEXT: Lloh5:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI2_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  addv.4s	s0, v0
-; CHECK-NEXT:  fmov	    w0, s0
-; CHECK-NEXT:  ret
+; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:    adrp x8, lCPI2_0@PAGE
+; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:  Lloh5:
+; CHECK-NEXT:    ldr q1, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addv.4s s0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh5
+
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -99,22 +72,20 @@ define i4 @convert_to_bitmask4(<4 x i32> %vec) {
 }
 
 define i8 @convert_to_bitmask2(<2 x i64> %vec) {
-; CHECK-LABEL: lCPI3_0:
-; CHECK-NEXT: .quad	1
-; CHECK-NEXT: .quad	2
-
-; CHECK-LABEL: convert_to_bitmask2
+; CHECK-LABEL: convert_to_bitmask2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT: Lloh6:
-; CHECK-NEXT:  adrp	    x8, lCPI3_0@PAGE
-; CHECK-NEXT:  cmeq.2d	v0, v0, #0
-; CHECK-NEXT: Lloh7:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI3_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  addp.2d	d0, v0
-; CHECK-NEXT:  fmov	    x8, d0
-; CHECK-NEXT:  and	    w0, w8, #0x3
-; CHECK-NEXT:  ret
+; CHECK-NEXT:  Lloh6:
+; CHECK-NEXT:    adrp x8, lCPI3_0@PAGE
+; CHECK-NEXT:    cmeq.2d v0, v0, #0
+; CHECK-NEXT:  Lloh7:
+; CHECK-NEXT:    ldr q1, [x8, lCPI3_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addp.2d d0, v0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    and w0, w8, #0x3
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh7
+
 
   %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
   %bitmask = bitcast <2 x i1> %cmp_result to i2
@@ -124,23 +95,19 @@ define i8 @convert_to_bitmask2(<2 x i64> %vec) {
 
 ; Clang's __builtin_convertvector adds an undef vector concat for vectors with <8 elements.
 define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
-; CHECK-LABEL: lCPI4_0:
-; CHECK-NEXT:  .long	1
-; CHECK-NEXT:  .long	2
-; CHECK-NEXT:  .long	4
-; CHECK-NEXT:  .long	8
-
-; CHECK-LABEL: clang_builtins_undef_concat_convert_to_bitmask4
+; CHECK-LABEL: clang_builtins_undef_concat_convert_to_bitmask4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT: Lloh8:
-; CHECK-NEXT:  adrp	    x8, lCPI4_0@PAGE
-; CHECK-NEXT:  cmeq.4s	v0, v0, #0
-; CHECK-NEXT: Lloh9:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI4_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  addv.4s	s0, v0
-; CHECK-NEXT:  fmov	    w0, s0
-; CHECK-NEXT:  ret
+; CHECK-NEXT:  Lloh8:
+; CHECK-NEXT:    adrp x8, lCPI4_0@PAGE
+; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:  Lloh9:
+; CHECK-NEXT:    ldr q1, [x8, lCPI4_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addv.4s s0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh9
+
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
   %vector_pad = shufflevector <4 x i1> %cmp_result, <4 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -150,25 +117,21 @@ define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) {
 
 
 define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: lCPI5_0:
-; CHECK-NEXT:  .long	1
-; CHECK-NEXT:  .long	2
-; CHECK-NEXT:  .long	4
-; CHECK-NEXT:  .long	8
-
-; CHECK-LABEL: convert_to_bitmask_no_compare
+; CHECK-LABEL: convert_to_bitmask_no_compare:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT: Lloh10:
-; CHECK-NEXT:  adrp	    x8, lCPI5_0@PAGE
-; CHECK-NEXT:  and.16b  v0, v0, v1
-; CHECK-NEXT:  shl.4s	v0, v0, #31
-; CHECK-NEXT: Lloh11:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI5_0@PAGEOFF]
-; CHECK-NEXT:  cmlt.4s	v0, v0, #0
-; CHECK-NEXT:  and.16b	v0, v0, v1
-; CHECK-NEXT:  addv.4s	s0, v0
-; CHECK-NEXT:  fmov	    w0, s0
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:  Lloh10:
+; CHECK-NEXT:    adrp x8, lCPI5_0@PAGE
+; CHECK-NEXT:  Lloh11:
+; CHECK-NEXT:    ldr q1, [x8, lCPI5_0@PAGEOFF]
+; CHECK-NEXT:    shl.4s v0, v0, #31
+; CHECK-NEXT:    cmlt.4s v0, v0, #0
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    addv.4s s0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh11
+
 
   %cmp = and <4 x i32> %vec1, %vec2
   %trunc = trunc <4 x i32> %cmp to <4 x i1>
@@ -177,25 +140,21 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) {
 }
 
 define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: lCPI6_0:
-; CHECK-NEXT:  .long	1
-; CHECK-NEXT:  .long	2
-; CHECK-NEXT:  .long	4
-; CHECK-NEXT:  .long	8
-
-; CHECK-LABEL: convert_to_bitmask_with_compare_chain
+; CHECK-LABEL: convert_to_bitmask_with_compare_chain:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT: Lloh12:
-; CHECK-NEXT:  adrp	    x8, lCPI6_0@PAGE
-; CHECK-NEXT:  cmeq.4s	v2, v0, #0
-; CHECK-NEXT:  cmeq.4s	v0, v0, v1
-; CHECK-NEXT: Lloh13:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI6_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v0, v2
-; CHECK-NEXT:  and.16b	v0, v0, v1
-; CHECK-NEXT:  addv.4s	s0, v0
-; CHECK-NEXT:  fmov	    w0, s0
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    cmeq.4s v2, v0, #0
+; CHECK-NEXT:    cmeq.4s v0, v0, v1
+; CHECK-NEXT:  Lloh12:
+; CHECK-NEXT:    adrp x8, lCPI6_0@PAGE
+; CHECK-NEXT:  Lloh13:
+; CHECK-NEXT:    ldr q1, [x8, lCPI6_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v0, v2
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    addv.4s s0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh13
+
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
   %cmp2 = icmp eq <4 x i32> %vec1, %vec2
@@ -205,26 +164,22 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec
 }
 
 define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: lCPI7_0:
-; CHECK-NEXT:  .long	1
-; CHECK-NEXT:  .long	2
-; CHECK-NEXT:  .long	4
-; CHECK-NEXT:  .long	8
-
-; CHECK-LABEL: convert_to_bitmask_with_trunc_in_chain
+; CHECK-LABEL: convert_to_bitmask_with_trunc_in_chain:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  cmeq.4s	v0, v0, #0
-; CHECK-NEXT: Lloh14:
-; CHECK-NEXT:  adrp	    x8, lCPI7_0@PAGE
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  shl.4s	v0, v0, #31
-; CHECK-NEXT: Lloh15:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI7_0@PAGEOFF]
-; CHECK-NEXT:  cmlt.4s	v0, v0, #0
-; CHECK-NEXT:  and.16b	v0, v0, v1
-; CHECK-NEXT:  addv.4s	s0, v0
-; CHECK-NEXT:  fmov	    w0, s0
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:  Lloh14:
+; CHECK-NEXT:    adrp x8, lCPI7_0@PAGE
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:  Lloh15:
+; CHECK-NEXT:    ldr q1, [x8, lCPI7_0@PAGEOFF]
+; CHECK-NEXT:    shl.4s v0, v0, #31
+; CHECK-NEXT:    cmlt.4s v0, v0, #0
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    addv.4s s0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh15
+
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
   %trunc_vec = trunc <4 x i32> %vec2 to <4 x i1>
@@ -234,38 +189,34 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve
 }
 
 define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: lCPI8_0:
-; CHECK-NEXT:  .short	1
-; CHECK-NEXT:  .short	2
-; CHECK-NEXT:  .short	4
-; CHECK-NEXT:  .short	8
-
-; CHECK-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain
-; CHECK:      ; %bb.0:
-; CHECK-NEXT: 	cmeq.4s	v0, v0, #0
-; CHECK-NEXT: Lloh16:
-; CHECK-NEXT: 	adrp	x8, lCPI8_0@PAGE
-; CHECK-NEXT: 	cmeq.4s	v1, v1, #0
-; CHECK-NEXT: 	movi	d2, #0x000000ffffffff
-; CHECK-NEXT: 	bic.16b	v0, v1, v0
-; CHECK-NEXT: 	movi	d1, #0xffff0000ffff0000
-; CHECK-NEXT: 	xtn.4h	v0, v0
-; CHECK-NEXT: 	movi	d3, #0x00ffffffffffff
-; CHECK-NEXT: 	orr.8b	v0, v0, v2
-; CHECK-NEXT: 	movi	d2, #0x00ffffffff0000
-; CHECK-NEXT: 	eor.8b	v1, v0, v1
-; CHECK-NEXT: 	mov.h	v1[2], wzr
-; CHECK-NEXT: 	eor.8b	v0, v0, v2
-; CHECK-NEXT: 	orr.8b	v0, v0, v3
-; CHECK-NEXT: 	orr.8b	v0, v1, v0
-; CHECK-NEXT: Lloh17:
-; CHECK-NEXT: 	ldr	d1, [x8, lCPI8_0@PAGEOFF]
-; CHECK-NEXT: 	shl.4h	v0, v0, #15
-; CHECK-NEXT: 	cmlt.4h	v0, v0, #0
-; CHECK-NEXT: 	and.8b	v0, v0, v1
-; CHECK-NEXT: 	addv.4h	h0, v0
-; CHECK-NEXT: 	fmov	w0, s0
-; CHECK-NEXT: 	ret
+; CHECK-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:    cmeq.4s v1, v1, #0
+; CHECK-NEXT:  Lloh16:
+; CHECK-NEXT:    adrp x8, lCPI8_0@PAGE
+; CHECK-NEXT:    movi d2, #0x000000ffffffff
+; CHECK-NEXT:    movi d3, #0x00ffffffffffff
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    movi d1, #0xffff0000ffff0000
+; CHECK-NEXT:    xtn.4h v0, v0
+; CHECK-NEXT:    orr.8b v0, v0, v2
+; CHECK-NEXT:    movi d2, #0x00ffffffff0000
+; CHECK-NEXT:    eor.8b v1, v0, v1
+; CHECK-NEXT:    eor.8b v0, v0, v2
+; CHECK-NEXT:    mov.h v1[2], wzr
+; CHECK-NEXT:    orr.8b v0, v0, v3
+; CHECK-NEXT:    orr.8b v0, v1, v0
+; CHECK-NEXT:  Lloh17:
+; CHECK-NEXT:    ldr d1, [x8, lCPI8_0@PAGEOFF]
+; CHECK-NEXT:    shl.4h v0, v0, #15
+; CHECK-NEXT:    cmlt.4h v0, v0, #0
+; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    addv.4h h0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh17
+
 
   %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer
   %cmp2 = icmp eq <4 x i32> %vec2, zeroinitializer
@@ -283,26 +234,22 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <
 }
 
 define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 x i32> %vec2) {
-; CHECK-LABEL: lCPI9_0:
-; CHECK-NEXT:  .short	1
-; CHECK-NEXT:  .short	2
-; CHECK-NEXT:  .short	4
-; CHECK-NEXT:  .short	8
-
-; CHECK-LABEL: convert_to_bitmask_with_different_types_in_chain
-; CHECK:      ; %bb.0:
-; CHECK-NEXT: Lloh18:
-; CHECK-NEXT: 	adrp	x8, lCPI9_0@PAGE
-; CHECK-NEXT: 	cmeq.4h	v0, v0, #0
-; CHECK-NEXT: 	cmeq.4s	v1, v1, #0
-; CHECK-NEXT: 	xtn.4h	v1, v1
-; CHECK-NEXT: Lloh19:
-; CHECK-NEXT: 	ldr	d2, [x8, lCPI9_0@PAGEOFF]
-; CHECK-NEXT: 	orn.8b	v0, v1, v0
-; CHECK-NEXT: 	and.8b	v0, v0, v2
-; CHECK-NEXT: 	addv.4h	h0, v0
-; CHECK-NEXT: 	fmov	w0, s0
-; CHECK-NEXT: 	ret
+; CHECK-LABEL: convert_to_bitmask_with_different_types_in_chain:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmeq.4s v1, v1, #0
+; CHECK-NEXT:    cmeq.4h v0, v0, #0
+; CHECK-NEXT:  Lloh18:
+; CHECK-NEXT:    adrp x8, lCPI9_0@PAGE
+; CHECK-NEXT:    xtn.4h v1, v1
+; CHECK-NEXT:    orn.8b v0, v1, v0
+; CHECK-NEXT:  Lloh19:
+; CHECK-NEXT:    ldr d1, [x8, lCPI9_0@PAGEOFF]
+; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    addv.4h h0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh19
+
 
   %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer
   %cmp2 = icmp eq <4 x i32> %vec2, zeroinitializer
@@ -314,35 +261,37 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4
 define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) {
 ; CHECK-LABEL: convert_to_bitmask_without_knowing_type:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    shl.16b v0, v0, #7
 ; CHECK-NEXT:  Lloh20:
 ; CHECK-NEXT:    adrp x8, lCPI10_0@PAGE
-; CHECK-NEXT:    shl.16b v0, v0, #7
-; CHECK-NEXT:    cmlt.16b v0, v0, #0
 ; CHECK-NEXT:  Lloh21:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI10_0@PAGEOFF]
+; CHECK-NEXT:    cmlt.16b v0, v0, #0
 ; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    ext.16b v1, v0, v0, #8
 ; CHECK-NEXT:    zip1.16b v0, v0, v1
 ; CHECK-NEXT:    addv.8h h0, v0
 ; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh21
 
   %bitmask = bitcast <16 x i1> %vec to i16
   ret i16 %bitmask
 }
 
 define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
-; CHECK-LABEL: convert_to_bitmask_2xi32
+; CHECK-LABEL: convert_to_bitmask_2xi32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:  Lloh22:
-; CHECK-NEXT:  	adrp	x8, lCPI11_0@PAGE
-; CHECK-NEXT:  	cmeq.2s	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI11_0@PAGE
+; CHECK-NEXT:    cmeq.2s v0, v0, #0
 ; CHECK-NEXT:  Lloh23:
-; CHECK-NEXT:  	ldr	d1, [x8, lCPI11_0@PAGEOFF]
-; CHECK-NEXT:  	bic.8b	v0, v1, v0
-; CHECK-NEXT:  	addp.2s	v0, v0, v0
-; CHECK-NEXT:  	fmov	w0, s0
-; CHECK-NEXT:  	ret
+; CHECK-NEXT:    ldr d1, [x8, lCPI11_0@PAGEOFF]
+; CHECK-NEXT:    bic.8b v0, v1, v0
+; CHECK-NEXT:    addp.2s v0, v0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh23
 
   %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
   %bitmask = bitcast <2 x i1> %cmp_result to i2
@@ -350,18 +299,19 @@ define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) {
 }
 
 define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
-; CHECK-LABEL: convert_to_bitmask_4xi8
+; CHECK-LABEL: convert_to_bitmask_4xi8:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    bic.4h v0, #255, lsl #8
 ; CHECK-NEXT:  Lloh24:
-; CHECK-NEXT:  	adrp	x8, lCPI12_0@PAGE
-; CHECK-NEXT:  	bic.4h	v0, #255, lsl #8
-; CHECK-NEXT:  	cmeq.4h	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI12_0@PAGE
 ; CHECK-NEXT:  Lloh25:
-; CHECK-NEXT:  	ldr	d1, [x8, lCPI12_0@PAGEOFF]
-; CHECK-NEXT:  	bic.8b	v0, v1, v0
-; CHECK-NEXT:  	addv.4h	h0, v0
-; CHECK-NEXT:  	fmov	w0, s0
-; CHECK-NEXT:  	ret
+; CHECK-NEXT:    ldr d1, [x8, lCPI12_0@PAGEOFF]
+; CHECK-NEXT:    cmeq.4h v0, v0, #0
+; CHECK-NEXT:    bic.8b v0, v1, v0
+; CHECK-NEXT:    addv.4h h0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh24, Lloh25
 
   %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -369,19 +319,20 @@ define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
 }
 
 define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
-; CHECK-LABEL: convert_to_bitmask_8xi2
+; CHECK-LABEL: convert_to_bitmask_8xi2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  	movi.8b v1, #3
+; CHECK-NEXT:    movi.8b v1, #3
 ; CHECK-NEXT:  Lloh26:
-; CHECK-NEXT:  	adrp	x8, lCPI13_0@PAGE
-; CHECK-NEXT:  	and.8b	v0, v0, v1
+; CHECK-NEXT:    adrp x8, lCPI13_0@PAGE
+; CHECK-NEXT:    and.8b v0, v0, v1
 ; CHECK-NEXT:  Lloh27:
-; CHECK-NEXT:  	ldr	d1, [x8, lCPI13_0@PAGEOFF]
-; CHECK-NEXT:  	cmeq.8b	v0, v0, #0
-; CHECK-NEXT:  	bic.8b	v0, v1, v0
-; CHECK-NEXT:  	addv.8b	b0, v0
-; CHECK-NEXT:  	fmov	w0, s0
-; CHECK-NEXT:  	ret
+; CHECK-NEXT:    ldr d1, [x8, lCPI13_0@PAGEOFF]
+; CHECK-NEXT:    cmeq.8b v0, v0, #0
+; CHECK-NEXT:    bic.8b v0, v1, v0
+; CHECK-NEXT:    addv.8b b0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh26, Lloh27
 
   %cmp_result = icmp ne <8 x i2> %vec, zeroinitializer
   %bitmask = bitcast <8 x i1> %cmp_result to i8
@@ -389,25 +340,21 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) {
 }
 
 define i4 @convert_to_bitmask_float(<4 x float> %vec) {
-; CHECK-LABEL: lCPI14_0:
-; CHECK-NEXT:  .long	1
-; CHECK-NEXT:  .long	2
-; CHECK-NEXT:  .long	4
-; CHECK-NEXT:  .long	8
-
-; CHECK-LABEL: convert_to_bitmask_float
+; CHECK-LABEL: convert_to_bitmask_float:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT: Lloh28:
-; CHECK-NEXT:  adrp	    x8, lCPI14_0@PAGE
-; CHECK-NEXT:  fcmgt.4s	v1, v0, #0.0
-; CHECK-NEXT:  fcmlt.4s	v0, v0, #0.0
-; CHECK-NEXT: Lloh29:
-; CHECK-NEXT:  ldr	    q2, [x8, lCPI14_0@PAGEOFF]
-; CHECK-NEXT:  orr.16b	v0, v0, v1
-; CHECK-NEXT:  and.16b	v0, v0, v2
-; CHECK-NEXT:  addv.4s	s0, v0
-; CHECK-NEXT:  fmov	    w0, s0
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    fcmgt.4s v1, v0, #0.0
+; CHECK-NEXT:    fcmlt.4s v0, v0, #0.0
+; CHECK-NEXT:  Lloh28:
+; CHECK-NEXT:    adrp x8, lCPI14_0@PAGE
+; CHECK-NEXT:    orr.16b v0, v0, v1
+; CHECK-NEXT:  Lloh29:
+; CHECK-NEXT:    ldr q1, [x8, lCPI14_0@PAGEOFF]
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    addv.4s s0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh28, Lloh29
+
 
   %cmp_result = fcmp one <4 x float> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -417,30 +364,25 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
 ; Larger vector types don't map directly, but they can be split/truncated and then converted.
 ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
 define i8 @convert_large_vector(<8 x i32> %vec) {
-; CHECK-LABEL: lCPI15_0:
-; CHECK-NEXT:  .short	1
-; CHECK-NEXT:  .short	2
-; CHECK-NEXT:  .short	4
-; CHECK-NEXT:  .short	8
-; CHECK-NEXT:  .short	16
-; CHECK-NEXT:  .short	32
-; CHECK-NEXT:  .short	64
-; CHECK-NEXT:  .short	128
-
 ; CHECK-LABEL: convert_large_vector:
-; CHECK:      Lloh30:
-; CHECK-NEXT:  adrp	x8, lCPI15_0@PAGE
-; CHECK-NEXT:  cmeq.4s	v1, v1, #0
-; CHECK-NEXT:  cmeq.4s	v0, v0, #0
-; CHECK-NEXT:  uzp1.8h	v0, v0, v1
-; CHECK-NEXT: Lloh31:
-; CHECK-NEXT:  ldr	q1, [x8, lCPI15_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  addv.8h	h0, v0
-; CHECK-NEXT:  fmov	w8, s0
-; CHECK-NEXT:  and	w0, w8, #0xff
-; CHECK-NEXT:  add	sp, sp, #16
-; CHECK-NEXT:  ret
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    cmeq.4s v1, v1, #0
+; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:  Lloh30:
+; CHECK-NEXT:    adrp x8, lCPI15_0@PAGE
+; CHECK-NEXT:    uzp1.8h v0, v0, v1
+; CHECK-NEXT:  Lloh31:
+; CHECK-NEXT:    ldr q1, [x8, lCPI15_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w0, w8, #0xff
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh30, Lloh31
+
 
    %cmp_result = icmp ne <8 x i32> %vec, zeroinitializer
    %bitmask = bitcast <8 x i1> %cmp_result to i8
@@ -448,19 +390,20 @@ define i8 @convert_large_vector(<8 x i32> %vec) {
 }
 
 define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
-; CHECK-LABEL: convert_legalized_illegal_element_size
+; CHECK-LABEL: convert_legalized_illegal_element_size:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:  movi.4s  v1, #63, msl #16
+; CHECK-NEXT:    movi.4s v1, #63, msl #16
 ; CHECK-NEXT:  Lloh32:
-; CHECK-NEXT:  adrp	    x8, lCPI16_0@PAGE
-; CHECK-NEXT:  cmtst.4s v0, v0, v1
+; CHECK-NEXT:    adrp x8, lCPI16_0@PAGE
+; CHECK-NEXT:    cmtst.4s v0, v0, v1
 ; CHECK-NEXT:  Lloh33:
-; CHECK-NEXT:  ldr	    d1, [x8, lCPI16_0@PAGEOFF]
-; CHECK-NEXT:  xtn.4h   v0, v0
-; CHECK-NEXT:  and.8b   v0, v0, v1
-; CHECK-NEXT:  addv.4h  h0, v0
-; CHECK-NEXT:  fmov     w0, s0
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    ldr d1, [x8, lCPI16_0@PAGEOFF]
+; CHECK-NEXT:    xtn.4h v0, v0
+; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    addv.4h h0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh33
 
   %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer
   %bitmask = bitcast <4 x i1> %cmp_result to i4
@@ -470,8 +413,28 @@ define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
 ; This may still be converted as a v8i8 after the vector concat (but not as v4iX).
 define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-LABEL: no_direct_convert_for_bad_concat:
-; CHECK:     cmtst.4s v0, v0, v0
-; CHECK-NOT: addv.4
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmtst.4s v0, v0, v0
+; CHECK-NEXT:  Lloh34:
+; CHECK-NEXT:    adrp x8, lCPI17_0@PAGE
+; CHECK-NEXT:    xtn.4h v0, v0
+; CHECK-NEXT:    umov.h w9, v0[0]
+; CHECK-NEXT:    mov.b v1[4], w9
+; CHECK-NEXT:    umov.h w9, v0[1]
+; CHECK-NEXT:    mov.b v1[5], w9
+; CHECK-NEXT:    umov.h w9, v0[2]
+; CHECK-NEXT:    mov.b v1[6], w9
+; CHECK-NEXT:    umov.h w9, v0[3]
+; CHECK-NEXT:    mov.b v1[7], w9
+; CHECK-NEXT:    shl.8b v0, v1, #7
+; CHECK-NEXT:  Lloh35:
+; CHECK-NEXT:    ldr d1, [x8, lCPI17_0@PAGEOFF]
+; CHECK-NEXT:    cmlt.8b v0, v0, #0
+; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    addv.8b b0, v0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh34, Lloh35
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
   %vector_pad = shufflevector <4 x i1> poison, <4 x i1> %cmp_result, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
@@ -481,16 +444,46 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 
 define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) {
 ; CHECK-LABEL: no_convert_without_direct_bitcast:
-; CHECK:     cmtst.8h v0, v0, v0
-; CHECK-NOT: addv.4s	s0, v0
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmtst.8h v0, v0, v0
+; CHECK-NEXT:    xtn.8b v0, v0
+; CHECK-NEXT:    ret
 
    %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
    ret <8 x i1> %cmp_result
 }
 
 define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
-; CHECK-LABEL: no_combine_illegal_num_elements
-; CHECK-NOT: addv
+; CHECK-LABEL: no_combine_illegal_num_elements:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w4
+; CHECK-NEXT:    mov.s v0[1], w1
+; CHECK-NEXT:    mov.s v1[1], w5
+; CHECK-NEXT:    mov.s v0[2], w2
+; CHECK-NEXT:    cmeq.4s v1, v1, #0
+; CHECK-NEXT:    mov.s v0[3], w3
+; CHECK-NEXT:    cmeq.4s v0, v0, #0
+; CHECK-NEXT:    uzp1.8h v0, v0, v1
+; CHECK-NEXT:    mvn.16b v0, v0
+; CHECK-NEXT:    xtn.8b v0, v0
+; CHECK-NEXT:    umov.b w8, v0[0]
+; CHECK-NEXT:    umov.b w9, v0[1]
+; CHECK-NEXT:    umov.b w10, v0[2]
+; CHECK-NEXT:    and w8, w8, #0x1
+; CHECK-NEXT:    bfi w8, w9, #1, #1
+; CHECK-NEXT:    umov.b w9, v0[3]
+; CHECK-NEXT:    bfi w8, w10, #2, #1
+; CHECK-NEXT:    umov.b w10, v0[4]
+; CHECK-NEXT:    bfi w8, w9, #3, #1
+; CHECK-NEXT:    umov.b w9, v0[5]
+; CHECK-NEXT:    bfi w8, w10, #4, #1
+; CHECK-NEXT:    orr w8, w8, w9, lsl #5
+; CHECK-NEXT:    and w0, w8, #0x3f
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
 
   %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer
   %bitmask = bitcast <6 x i1> %cmp_result to i6

diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
index ccc66fd000397a..9c6ab8da0fa756 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -1,39 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
 
 define void @store_16_elements(<16 x i8> %vec, ptr %out) {
 ; Bits used in mask
-; CHECK-LABEL: lCPI0_0
-; CHECK-NEXT: .byte	1
-; CHECK-NEXT: .byte	2
-; CHECK-NEXT: .byte	4
-; CHECK-NEXT: .byte	8
-; CHECK-NEXT: .byte	16
-; CHECK-NEXT: .byte	32
-; CHECK-NEXT: .byte	64
-; CHECK-NEXT: .byte	128
-; CHECK-NEXT: .byte	1
-; CHECK-NEXT: .byte	2
-; CHECK-NEXT: .byte	4
-; CHECK-NEXT: .byte	8
-; CHECK-NEXT: .byte	16
-; CHECK-NEXT: .byte	32
-; CHECK-NEXT: .byte	64
-; CHECK-NEXT: .byte	128
-
-; Actual conversion
-; CHECK-LABEL: store_16_elements
+; CHECK-LABEL: store_16_elements:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:  Lloh0:
-; CHECK-NEXT:  adrp	    x8, lCPI0_0@PAGE
-; CHECK-NEXT:  cmeq.16b	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
+; CHECK-NEXT:    cmeq.16b v0, v0, #0
 ; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI0_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  ext.16b	v1, v0, v0, #8
-; CHECK-NEXT:  zip1.16b	v0, v0, v1
-; CHECK-NEXT:  addv.8h	h0, v0
-; CHECK-NEXT:  str	    h0, [x0]
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    ext.16b v1, v0, v0, #8
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
+
+; Actual conversion
 
   %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
   store <16 x i1> %cmp_result, ptr %out
@@ -41,28 +26,20 @@ define void @store_16_elements(<16 x i8> %vec, ptr %out) {
 }
 
 define void @store_8_elements(<8 x i16> %vec, ptr %out) {
-; CHECK-LABEL: lCPI1_0:
-; CHECK-NEXT: .short	1
-; CHECK-NEXT: .short	2
-; CHECK-NEXT: .short	4
-; CHECK-NEXT: .short	8
-; CHECK-NEXT: .short	16
-; CHECK-NEXT: .short	32
-; CHECK-NEXT: .short	64
-; CHECK-NEXT: .short	128
-
-; CHECK-LABEL: store_8_elements
+; CHECK-LABEL: store_8_elements:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:  adrp	    x8, lCPI1_0@PAGE
-; CHECK-NEXT:  cmeq.8h	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI1_0@PAGE
+; CHECK-NEXT:    cmeq.8h v0, v0, #0
 ; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI1_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  addv.8h	h0, v0
-; CHECK-NEXT:  fmov	    w8, s0
-; CHECK-NEXT:  strb	    w8, [x0]
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    ldr q1, [x8, lCPI1_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
+
 
   %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
   store <8 x i1> %cmp_result, ptr %out
@@ -70,24 +47,20 @@ define void @store_8_elements(<8 x i16> %vec, ptr %out) {
 }
 
 define void @store_4_elements(<4 x i32> %vec, ptr %out) {
-; CHECK-LABEL: lCPI2_0:
-; CHECK-NEXT:  .long	1
-; CHECK-NEXT:  .long	2
-; CHECK-NEXT:  .long	4
-; CHECK-NEXT:  .long	8
-
-; CHECK-LABEL: store_4_elements
+; CHECK-LABEL: store_4_elements:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:  Lloh4:
-; CHECK-NEXT:  adrp	x8, lCPI2_0@PAGE
-; CHECK-NEXT:  cmeq.4s	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI2_0@PAGE
+; CHECK-NEXT:    cmeq.4s v0, v0, #0
 ; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI2_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  addv.4s	s0, v0
-; CHECK-NEXT:  fmov	    w8, s0
-; CHECK-NEXT:  strb	    w8, [x0]
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    ldr q1, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addv.4s s0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh5
+
 
   %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
   store <4 x i1> %cmp_result, ptr %out
@@ -95,22 +68,20 @@ define void @store_4_elements(<4 x i32> %vec, ptr %out) {
 }
 
 define void @store_2_elements(<2 x i64> %vec, ptr %out) {
-; CHECK-LABEL: lCPI3_0:
-; CHECK-NEXT: .quad	1
-; CHECK-NEXT: .quad	2
-
-; CHECK-LABEL: store_2_elements
+; CHECK-LABEL: store_2_elements:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:  Lloh6:
-; CHECK-NEXT:  adrp	x8, lCPI3_0@PAGE
-; CHECK-NEXT:  cmeq.2d	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI3_0@PAGE
+; CHECK-NEXT:    cmeq.2d v0, v0, #0
 ; CHECK-NEXT:  Lloh7:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI3_0@PAGEOFF]
-; CHECK-NEXT:  bic.16b	v0, v1, v0
-; CHECK-NEXT:  addp.2d	d0, v0
-; CHECK-NEXT:  fmov	    x8, d0
-; CHECK-NEXT:  strb	    w8, [x0]
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    ldr q1, [x8, lCPI3_0@PAGEOFF]
+; CHECK-NEXT:    bic.16b v0, v1, v0
+; CHECK-NEXT:    addp.2d d0, v0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh7
+
 
   %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
   store <2 x i1> %cmp_result, ptr %out
@@ -118,25 +89,21 @@ define void @store_2_elements(<2 x i64> %vec, ptr %out) {
 }
 
 define void @add_trunc_compare_before_store(<4 x i32> %vec, ptr %out) {
-; CHECK-LABEL: lCPI4_0:
-; CHECK-NEXT:  .long	1
-; CHECK-NEXT:  .long	2
-; CHECK-NEXT:  .long	4
-; CHECK-NEXT:  .long	8
-
-; CHECK-LABEL: add_trunc_compare_before_store
+; CHECK-LABEL: add_trunc_compare_before_store:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    shl.4s v0, v0, #31
 ; CHECK-NEXT:  Lloh8:
-; CHECK-NEXT:  adrp	    x8, lCPI4_0@PAGE
-; CHECK-NEXT:  shl.4s	v0, v0, #31
-; CHECK-NEXT:  cmlt.4s	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI4_0@PAGE
 ; CHECK-NEXT:  Lloh9:
-; CHECK-NEXT:  ldr	    q1, [x8, lCPI4_0@PAGEOFF]
-; CHECK-NEXT:  and.16b	v0, v0, v1
-; CHECK-NEXT:  addv.4s	s0, v0
-; CHECK-NEXT:  fmov 	w8, s0
-; CHECK-NEXT:  strb	    w8, [x0]
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    ldr q1, [x8, lCPI4_0@PAGEOFF]
+; CHECK-NEXT:    cmlt.4s v0, v0, #0
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    addv.4s s0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh9
+
 
   %trunc = trunc <4 x i32> %vec to <4 x i1>
   store <4 x i1> %trunc, ptr %out
@@ -144,52 +111,40 @@ define void @add_trunc_compare_before_store(<4 x i32> %vec, ptr %out) {
 }
 
 define void @add_trunc_mask_unknown_vector_type(<4 x i1> %vec, ptr %out) {
-; CHECK-LABEL: lCPI5_0:
-; CHECK: .short	1
-; CHECK: .short	2
-; CHECK: .short	4
-; CHECK: .short	8
-
-; CHECK-LABEL: add_trunc_mask_unknown_vector_type
+; CHECK-LABEL: add_trunc_mask_unknown_vector_type:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT: Lloh10:
-; CHECK-NEXT:  adrp	    x8, lCPI5_0@PAGE
-; CHECK-NEXT:  shl.4h	v0, v0, #15
-; CHECK-NEXT:  cmlt.4h	v0, v0, #0
-; CHECK-NEXT: Lloh11:
-; CHECK-NEXT:  ldr	    d1, [x8, lCPI5_0@PAGEOFF]
-; CHECK-NEXT:  and.8b    v0, v0, v1
-; CHECK-NEXT:  addv.4h	h0, v0
-; CHECK-NEXT:  fmov	    w8, s0
-; CHECK-NEXT:  strb	    w8, [x0]
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    shl.4h v0, v0, #15
+; CHECK-NEXT:  Lloh10:
+; CHECK-NEXT:    adrp x8, lCPI5_0@PAGE
+; CHECK-NEXT:  Lloh11:
+; CHECK-NEXT:    ldr d1, [x8, lCPI5_0@PAGEOFF]
+; CHECK-NEXT:    cmlt.4h v0, v0, #0
+; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    addv.4h h0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh11
+
 
   store <4 x i1> %vec, ptr %out
   ret void
 }
 
 define void @store_8_elements_64_bit_vector(<8 x i8> %vec, ptr %out) {
-; CHECK-LABEL: lCPI6_0:
-; CHECK-NEXT:  .byte	1
-; CHECK-NEXT:  .byte	2
-; CHECK-NEXT:  .byte	4
-; CHECK-NEXT:  .byte	8
-; CHECK-NEXT:  .byte	16
-; CHECK-NEXT:  .byte	32
-; CHECK-NEXT:  .byte	64
-; CHECK-NEXT:  .byte	128
-
-; CHECK-LABEL: store_8_elements_64_bit_vector
+; CHECK-LABEL: store_8_elements_64_bit_vector:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:  Lloh12:
-; CHECK-NEXT:  adrp	x8, lCPI6_0@PAGE
-; CHECK-NEXT:  cmeq.8b	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI6_0@PAGE
+; CHECK-NEXT:    cmeq.8b v0, v0, #0
 ; CHECK-NEXT:  Lloh13:
-; CHECK-NEXT:  ldr	    d1, [x8, lCPI6_0@PAGEOFF]
-; CHECK-NEXT:  bic.8b	v0, v1, v0
-; CHECK-NEXT:  addv.8b	b0, v0
-; CHECK-NEXT:  st1.b	{ v0 }[0], [x0]
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    ldr d1, [x8, lCPI6_0@PAGEOFF]
+; CHECK-NEXT:    bic.8b v0, v1, v0
+; CHECK-NEXT:    addv.8b b0, v0
+; CHECK-NEXT:    st1.b { v0 }[0], [x0]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh13
+
 
   %cmp_result = icmp ne <8 x i8> %vec, zeroinitializer
   store <8 x i1> %cmp_result, ptr %out
@@ -197,24 +152,20 @@ define void @store_8_elements_64_bit_vector(<8 x i8> %vec, ptr %out) {
 }
 
 define void @store_4_elements_64_bit_vector(<4 x i16> %vec, ptr %out) {
-; CHECK-LABEL: lCPI7_0:
-; CHECK-NEXT:  .short	1
-; CHECK-NEXT:  .short	2
-; CHECK-NEXT:  .short	4
-; CHECK-NEXT:  .short	8
-
-; CHECK-LABEL: store_4_elements_64_bit_vector
+; CHECK-LABEL: store_4_elements_64_bit_vector:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:  Lloh14:
-; CHECK-NEXT:  adrp	x8, lCPI7_0@PAGE
-; CHECK-NEXT:  cmeq.4h	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI7_0@PAGE
+; CHECK-NEXT:    cmeq.4h v0, v0, #0
 ; CHECK-NEXT:  Lloh15:
-; CHECK-NEXT:  ldr	    d1, [x8, lCPI7_0@PAGEOFF]
-; CHECK-NEXT:  bic.8b	v0, v1, v0
-; CHECK-NEXT:  addv.4h	h0, v0
-; CHECK-NEXT:  fmov	    w8, s0
-; CHECK-NEXT:  strb	    w8, [x0]
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    ldr d1, [x8, lCPI7_0@PAGEOFF]
+; CHECK-NEXT:    bic.8b v0, v1, v0
+; CHECK-NEXT:    addv.4h h0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh15
+
 
   %cmp_result = icmp ne <4 x i16> %vec, zeroinitializer
   store <4 x i1> %cmp_result, ptr %out
@@ -222,22 +173,20 @@ define void @store_4_elements_64_bit_vector(<4 x i16> %vec, ptr %out) {
 }
 
 define void @store_2_elements_64_bit_vector(<2 x i32> %vec, ptr %out) {
-; CHECK-LABEL: lCPI8_0:
-; CHECK-NEXT:  .long	1
-; CHECK-NEXT:  .long	2
-
-; CHECK-LABEL: store_2_elements_64_bit_vector
+; CHECK-LABEL: store_2_elements_64_bit_vector:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:  Lloh16:
-; CHECK-NEXT:  adrp	x8, lCPI8_0@PAGE
-; CHECK-NEXT:  cmeq.2s	v0, v0, #0
+; CHECK-NEXT:    adrp x8, lCPI8_0@PAGE
+; CHECK-NEXT:    cmeq.2s v0, v0, #0
 ; CHECK-NEXT:  Lloh17:
-; CHECK-NEXT:  ldr	    d1, [x8, lCPI8_0@PAGEOFF]
-; CHECK-NEXT:  bic.8b	v0, v1, v0
-; CHECK-NEXT:  addp.2s	v0, v0, v0
-; CHECK-NEXT:  fmov	    w8, s0
-; CHECK-NEXT:  strb	    w8, [x0]
-; CHECK-NEXT:  ret
+; CHECK-NEXT:    ldr d1, [x8, lCPI8_0@PAGEOFF]
+; CHECK-NEXT:    bic.8b v0, v1, v0
+; CHECK-NEXT:    addp.2s v0, v0, v0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh17
+
 
   %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
   store <2 x i1> %cmp_result, ptr %out
@@ -245,9 +194,11 @@ define void @store_2_elements_64_bit_vector(<2 x i32> %vec, ptr %out) {
 }
 
 define void @no_combine_without_truncate(<16 x i8> %vec, ptr %out) {
-; CHECK-LABEL: no_combine_without_truncate
-; CHECK:     cmtst.16b v0, v0, v0
-; CHECK-NOT: addv.8b	b0, v0
+; CHECK-LABEL: no_combine_without_truncate:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    cmtst.16b v0, v0, v0
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
 
   %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
   %extended_result = sext <16 x i1> %cmp_result to <16 x i8>
@@ -256,9 +207,12 @@ define void @no_combine_without_truncate(<16 x i8> %vec, ptr %out) {
 }
 
 define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) {
-; CHECK-LABEL: no_combine_for_non_bool_truncate
-; CHECK:     xtn.4h v0, v0
-; CHECK-NOT: addv.4s s0, v0
+; CHECK-LABEL: no_combine_for_non_bool_truncate:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    xtn.4h v0, v0
+; CHECK-NEXT:    xtn.8b v0, v0
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
 
   %trunc = trunc <4 x i32> %vec to <4 x i8>
   store <4 x i8> %trunc, ptr %out
@@ -266,8 +220,13 @@ define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) {
 }
 
 define void @no_combine_for_build_vector(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) {
-; CHECK-LABEL: no_combine_for_build_vector
-; CHECK-NOT: addv
+; CHECK-LABEL: no_combine_for_build_vector:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    orr w8, w0, w1, lsl #1
+; CHECK-NEXT:    orr w8, w8, w2, lsl #2
+; CHECK-NEXT:    orr w8, w8, w3, lsl #3
+; CHECK-NEXT:    strb w8, [x4]
+; CHECK-NEXT:    ret
 
   %1 =   insertelement <4 x i1> undef, i1 %a, i64 0
   %2 =   insertelement <4 x i1>    %1, i1 %b, i64 1

diff --git a/llvm/test/CodeGen/AArch64/vec-libcalls.ll b/llvm/test/CodeGen/AArch64/vec-libcalls.ll
index 9ce9e96e997edc..e1b4967ed0fb93 100644
--- a/llvm/test/CodeGen/AArch64/vec-libcalls.ll
+++ b/llvm/test/CodeGen/AArch64/vec-libcalls.ll
@@ -144,11 +144,11 @@ define <5 x float> @sin_v5f32(<5 x float> %x) nounwind {
 ; CHECK-NEXT:    str d12, [sp, #-48]! // 8-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #8] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov s10, s2
+; CHECK-NEXT:    fmov s11, s1
 ; CHECK-NEXT:    stp d9, d8, [sp, #24] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov s8, s4
-; CHECK-NEXT:    str x30, [sp, #40] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s9, s3
-; CHECK-NEXT:    fmov s11, s1
+; CHECK-NEXT:    str x30, [sp, #40] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    fmov s12, s0
 ; CHECK-NEXT:    fmov s0, s11
@@ -164,12 +164,12 @@ define <5 x float> @sin_v5f32(<5 x float> %x) nounwind {
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    fmov s1, s11
 ; CHECK-NEXT:    fmov s2, s10
-; CHECK-NEXT:    fmov s3, s9
 ; CHECK-NEXT:    ldr x30, [sp, #40] // 8-byte Folded Reload
+; CHECK-NEXT:    fmov s3, s9
 ; CHECK-NEXT:    ldp d9, d8, [sp, #24] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #8] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov s4, s0
 ; CHECK-NEXT:    fmov s0, s12
-; CHECK-NEXT:    ldp d11, d10, [sp, #8] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr d12, [sp], #48 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %r = call <5 x float> @llvm.sin.v5f32(<5 x float> %x)
@@ -182,11 +182,11 @@ define <6 x float> @sin_v6f32(<6 x float> %x) nounwind {
 ; CHECK-NEXT:    stp d13, d12, [sp, #-64]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov s10, s3
+; CHECK-NEXT:    fmov s11, s2
 ; CHECK-NEXT:    stp d9, d8, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov s8, s5
-; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s9, s4
-; CHECK-NEXT:    fmov s11, s2
+; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov s12, s1
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    fmov s13, s0
@@ -206,12 +206,12 @@ define <6 x float> @sin_v6f32(<6 x float> %x) nounwind {
 ; CHECK-NEXT:    bl sinf
 ; CHECK-NEXT:    fmov s2, s11
 ; CHECK-NEXT:    fmov s3, s10
-; CHECK-NEXT:    fmov s4, s9
 ; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT:    fmov s4, s9
 ; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov s5, s0
 ; CHECK-NEXT:    fmov s0, s13
-; CHECK-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    fmov s1, s12
 ; CHECK-NEXT:    ldp d13, d12, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -225,8 +225,8 @@ define <3 x double> @sin_v3f64(<3 x double> %x) nounwind {
 ; CHECK-NEXT:    str d10, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #8] // 16-byte Folded Spill
 ; CHECK-NEXT:    fmov d8, d2
-; CHECK-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
 ; CHECK-NEXT:    fmov d9, d1
+; CHECK-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl sin
 ; CHECK-NEXT:    fmov d10, d0
 ; CHECK-NEXT:    fmov d0, d9
@@ -235,8 +235,8 @@ define <3 x double> @sin_v3f64(<3 x double> %x) nounwind {
 ; CHECK-NEXT:    fmov d0, d8
 ; CHECK-NEXT:    bl sin
 ; CHECK-NEXT:    fmov d1, d9
-; CHECK-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
 ; CHECK-NEXT:    fmov d2, d0
 ; CHECK-NEXT:    fmov d0, d10
 ; CHECK-NEXT:    ldr d10, [sp], #32 // 8-byte Folded Reload

diff --git a/llvm/test/CodeGen/AArch64/vec_cttz.ll b/llvm/test/CodeGen/AArch64/vec_cttz.ll
index 231790fc212198..26565f4ab095a9 100644
--- a/llvm/test/CodeGen/AArch64/vec_cttz.ll
+++ b/llvm/test/CodeGen/AArch64/vec_cttz.ll
@@ -54,7 +54,7 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %a) nounwind {
 define <1 x i64> @cttz_v1i64(<1 x i64> %a) nounwind {
 ; CHECK-LABEL: cttz_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    fmov d1, x8
 ; CHECK-NEXT:    sub d1, d0, d1
 ; CHECK-NEXT:    bic v0.8b, v1.8b, v0.8b
@@ -85,8 +85,8 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %a) nounwind {
 ; CHECK-NEXT:    movi v1.8h, #1
 ; CHECK-NEXT:    sub v1.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    clz v0.8h, v0.8h
 ; CHECK-NEXT:    movi v1.8h, #16
+; CHECK-NEXT:    clz v0.8h, v0.8h
 ; CHECK-NEXT:    sub v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
     %b = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
@@ -99,8 +99,8 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %a) nounwind {
 ; CHECK-NEXT:    movi v1.4s, #1
 ; CHECK-NEXT:    sub v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    clz v0.4s, v0.4s
 ; CHECK-NEXT:    movi v1.4s, #32
+; CHECK-NEXT:    clz v0.4s, v0.4s
 ; CHECK-NEXT:    sub v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    ret
     %b = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
@@ -110,7 +110,7 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %a) nounwind {
 define <2 x i64> @cttz_v2i64(<2 x i64> %a) nounwind {
 ; CHECK-LABEL: cttz_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    sub v1.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b

diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index cf4f6095f4b179..6ad880020cc664 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -52,8 +52,8 @@ define <3 x i32> @uaddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    add x8, x0, #8
 ; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    str d1, [x0]
 ; CHECK-NEXT:    st1 { v1.s }[2], [x8]
+; CHECK-NEXT:    str d1, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<3 x i32>, <3 x i1>} @llvm.uadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
   %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
@@ -81,34 +81,34 @@ define <4 x i32> @uaddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v6i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s0, w6
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w6
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s3, w4
 ; CHECK-NEXT:    ldr s2, [sp, #16]
 ; CHECK-NEXT:    add x9, sp, #24
-; CHECK-NEXT:    add x10, sp, #8
-; CHECK-NEXT:    mov v0.s[1], w7
-; CHECK-NEXT:    fmov s3, w4
-; CHECK-NEXT:    mov v1.s[1], w1
+; CHECK-NEXT:    mov v0.s[1], w1
+; CHECK-NEXT:    mov v1.s[1], w7
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
 ; CHECK-NEXT:    mov v3.s[1], w5
-; CHECK-NEXT:    ld1 { v0.s }[2], [x8]
-; CHECK-NEXT:    mov v1.s[2], w2
-; CHECK-NEXT:    ldr x8, [sp, #32]
+; CHECK-NEXT:    mov v0.s[2], w2
+; CHECK-NEXT:    ld1 { v1.s }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #8
 ; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    ld1 { v0.s }[3], [x10]
-; CHECK-NEXT:    mov v1.s[3], w3
-; CHECK-NEXT:    str d2, [x8, #16]
+; CHECK-NEXT:    ld1 { v1.s }[3], [x8]
+; CHECK-NEXT:    ldr x8, [sp, #32]
+; CHECK-NEXT:    mov v0.s[3], w3
 ; CHECK-NEXT:    cmhi v3.4s, v3.4s, v2.4s
+; CHECK-NEXT:    str d2, [x8, #16]
+; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    mov w5, v3.s[1]
 ; CHECK-NEXT:    fmov w4, s3
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    cmhi v1.4s, v1.4s, v0.4s
-; CHECK-NEXT:    str q0, [x8]
-; CHECK-NEXT:    mov w1, v1.s[1]
-; CHECK-NEXT:    mov w2, v1.s[2]
-; CHECK-NEXT:    mov w3, v1.s[3]
-; CHECK-NEXT:    fmov w0, s1
+; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    str q1, [x8]
+; CHECK-NEXT:    mov w1, v0.s[1]
+; CHECK-NEXT:    mov w2, v0.s[2]
+; CHECK-NEXT:    mov w3, v0.s[3]
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
   %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
@@ -121,10 +121,10 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v3.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    cmhi v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v3.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    cmhi v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    cmhi v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    stp q2, q3, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
@@ -141,23 +141,23 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; CHECK-NEXT:    add v4.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    cmhi v0.16b, v0.16b, v4.16b
 ; CHECK-NEXT:    str q4, [x0]
-; CHECK-NEXT:    zip1 v1.8b, v0.8b, v0.8b
-; CHECK-NEXT:    zip2 v2.8b, v0.8b, v0.8b
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    zip1 v3.8b, v0.8b, v0.8b
-; CHECK-NEXT:    zip2 v5.8b, v0.8b, v0.8b
-; CHECK-NEXT:    shl v0.4s, v1.4s, #31
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    zip1 v2.8b, v0.8b, v0.8b
+; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT:    zip1 v3.8b, v1.8b, v0.8b
+; CHECK-NEXT:    zip2 v1.8b, v1.8b, v0.8b
 ; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
-; CHECK-NEXT:    shl v1.4s, v2.4s, #31
-; CHECK-NEXT:    ushll v2.4s, v3.4h, #0
-; CHECK-NEXT:    ushll v3.4s, v5.4h, #0
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    shl v2.4s, v2.4s, #31
+; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    shl v5.4s, v0.4s, #31
+; CHECK-NEXT:    cmlt v0.4s, v2.4s, #0
 ; CHECK-NEXT:    shl v3.4s, v3.4s, #31
-; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmlt v2.4s, v2.4s, #0
-; CHECK-NEXT:    cmlt v3.4s, v3.4s, #0
+; CHECK-NEXT:    shl v6.4s, v1.4s, #31
+; CHECK-NEXT:    cmlt v1.4s, v5.4s, #0
+; CHECK-NEXT:    cmlt v2.4s, v3.4s, #0
+; CHECK-NEXT:    cmlt v3.4s, v6.4s, #0
 ; CHECK-NEXT:    ret
   %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
   %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
@@ -213,26 +213,26 @@ define <4 x i32> @uaddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; CHECK-NEXT:    bic v1.4s, #255, lsl #24
 ; CHECK-NEXT:    bic v0.4s, #255, lsl #24
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, v0.s[3]
 ; CHECK-NEXT:    mov w9, v0.s[2]
 ; CHECK-NEXT:    mov w10, v0.s[1]
 ; CHECK-NEXT:    fmov w11, s0
-; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    bic v1.4s, #1, lsl #24
 ; CHECK-NEXT:    sturh w8, [x0, #9]
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    cmeq v1.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    strh w9, [x0, #6]
-; CHECK-NEXT:    sturh w10, [x0, #3]
 ; CHECK-NEXT:    lsr w9, w9, #16
-; CHECK-NEXT:    lsr w10, w10, #16
+; CHECK-NEXT:    cmeq v1.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    strb w8, [x0, #11]
-; CHECK-NEXT:    lsr w8, w11, #16
-; CHECK-NEXT:    strh w11, [x0]
-; CHECK-NEXT:    mvn v0.16b, v1.16b
+; CHECK-NEXT:    lsr w8, w10, #16
 ; CHECK-NEXT:    strb w9, [x0, #8]
-; CHECK-NEXT:    strb w10, [x0, #5]
-; CHECK-NEXT:    strb w8, [x0, #2]
+; CHECK-NEXT:    lsr w9, w11, #16
+; CHECK-NEXT:    sturh w10, [x0, #3]
+; CHECK-NEXT:    mvn v0.16b, v1.16b
+; CHECK-NEXT:    strh w11, [x0]
+; CHECK-NEXT:    strb w8, [x0, #5]
+; CHECK-NEXT:    strb w9, [x0, #2]
 ; CHECK-NEXT:    ret
   %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
   %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
@@ -247,16 +247,16 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.4h, #1
 ; CHECK-NEXT:    adrp x8, .LCPI10_0
-; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI10_0]
 ; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    shl v2.4h, v0.4h, #15
+; CHECK-NEXT:    cmlt v2.4h, v2.4h, #0
 ; CHECK-NEXT:    bic v1.4h, #2
 ; CHECK-NEXT:    cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    cmlt v1.4h, v2.4h, #0
-; CHECK-NEXT:    and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    and v1.8b, v2.8b, v1.8b
 ; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    addv h1, v1.4h
 ; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
@@ -284,8 +284,8 @@ define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind
 ; CHECK-NEXT:    mov v0.s[1], w10
 ; CHECK-NEXT:    ldr x10, [sp]
 ; CHECK-NEXT:    stp x8, x9, [x10, #16]
-; CHECK-NEXT:    shl v0.2s, v0.2s, #31
 ; CHECK-NEXT:    stp x11, x12, [x10]
+; CHECK-NEXT:    shl v0.2s, v0.2s, #31
 ; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
 ; CHECK-NEXT:    ret
   %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)

diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
index 029b396ef25398..3a481efd9785aa 100644
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -54,8 +54,8 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: umulo_v3i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    add x8, x0, #8
 ; CHECK-NEXT:    umull v3.2d, v0.2s, v1.2s
+; CHECK-NEXT:    add x8, x0, #8
 ; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    uzp2 v2.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    st1 { v1.s }[2], [x8]
@@ -93,40 +93,40 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
 define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: umulo_v6i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s0, w6
-; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    fmov s1, w6
 ; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s3, w4
 ; CHECK-NEXT:    ldr s2, [sp, #16]
 ; CHECK-NEXT:    add x9, sp, #24
-; CHECK-NEXT:    add x10, sp, #8
-; CHECK-NEXT:    mov v0.s[1], w7
-; CHECK-NEXT:    fmov s3, w4
-; CHECK-NEXT:    mov v1.s[1], w1
+; CHECK-NEXT:    mov v0.s[1], w1
+; CHECK-NEXT:    mov v1.s[1], w7
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
 ; CHECK-NEXT:    mov v3.s[1], w5
-; CHECK-NEXT:    ld1 { v0.s }[2], [x8]
-; CHECK-NEXT:    mov v1.s[2], w2
-; CHECK-NEXT:    ldr x8, [sp, #32]
-; CHECK-NEXT:    umull2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT:    ld1 { v0.s }[3], [x10]
-; CHECK-NEXT:    mov v1.s[3], w3
+; CHECK-NEXT:    mov v0.s[2], w2
+; CHECK-NEXT:    ld1 { v1.s }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #8
+; CHECK-NEXT:    umull2 v6.2d, v3.4s, v2.4s
 ; CHECK-NEXT:    umull v7.2d, v3.2s, v2.2s
 ; CHECK-NEXT:    mul v2.4s, v3.4s, v2.4s
-; CHECK-NEXT:    umull2 v5.2d, v1.4s, v0.4s
-; CHECK-NEXT:    umull v6.2d, v1.2s, v0.2s
-; CHECK-NEXT:    uzp2 v4.4s, v7.4s, v4.4s
+; CHECK-NEXT:    ld1 { v1.s }[3], [x8]
+; CHECK-NEXT:    ldr x8, [sp, #32]
+; CHECK-NEXT:    mov v0.s[3], w3
 ; CHECK-NEXT:    str d2, [x8, #16]
-; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    uzp2 v5.4s, v6.4s, v5.4s
-; CHECK-NEXT:    cmtst v4.4s, v4.4s, v4.4s
+; CHECK-NEXT:    umull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT:    umull v5.2d, v0.2s, v1.2s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp2 v4.4s, v5.4s, v4.4s
+; CHECK-NEXT:    uzp2 v5.4s, v7.4s, v6.4s
 ; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    cmtst v4.4s, v4.4s, v4.4s
 ; CHECK-NEXT:    cmtst v5.4s, v5.4s, v5.4s
-; CHECK-NEXT:    mov w5, v4.s[1]
-; CHECK-NEXT:    fmov w4, s4
-; CHECK-NEXT:    mov w1, v5.s[1]
-; CHECK-NEXT:    mov w2, v5.s[2]
-; CHECK-NEXT:    mov w3, v5.s[3]
-; CHECK-NEXT:    fmov w0, s5
+; CHECK-NEXT:    mov w1, v4.s[1]
+; CHECK-NEXT:    mov w2, v4.s[2]
+; CHECK-NEXT:    mov w5, v5.s[1]
+; CHECK-NEXT:    mov w3, v4.s[3]
+; CHECK-NEXT:    fmov w4, s5
+; CHECK-NEXT:    fmov w0, s4
 ; CHECK-NEXT:    ret
   %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
   %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
@@ -139,17 +139,17 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: umulo_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umull2 v4.2d, v1.4s, v3.4s
-; CHECK-NEXT:    umull2 v5.2d, v0.4s, v2.4s
-; CHECK-NEXT:    umull v6.2d, v0.2s, v2.2s
+; CHECK-NEXT:    umull2 v4.2d, v0.4s, v2.4s
+; CHECK-NEXT:    umull v5.2d, v0.2s, v2.2s
+; CHECK-NEXT:    umull2 v6.2d, v1.4s, v3.4s
 ; CHECK-NEXT:    umull v7.2d, v1.2s, v3.2s
-; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
+; CHECK-NEXT:    mul v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    uzp2 v5.4s, v6.4s, v5.4s
-; CHECK-NEXT:    uzp2 v6.4s, v7.4s, v4.4s
-; CHECK-NEXT:    stp q2, q3, [x0]
-; CHECK-NEXT:    cmtst v4.4s, v5.4s, v5.4s
-; CHECK-NEXT:    cmtst v5.4s, v6.4s, v6.4s
+; CHECK-NEXT:    uzp2 v4.4s, v5.4s, v4.4s
+; CHECK-NEXT:    uzp2 v5.4s, v7.4s, v6.4s
+; CHECK-NEXT:    stp q2, q1, [x0]
+; CHECK-NEXT:    cmtst v4.4s, v4.4s, v4.4s
+; CHECK-NEXT:    cmtst v5.4s, v5.4s, v5.4s
 ; CHECK-NEXT:    mov v0.16b, v4.16b
 ; CHECK-NEXT:    mov v1.16b, v5.16b
 ; CHECK-NEXT:    ret
@@ -170,23 +170,23 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
 ; CHECK-NEXT:    uzp2 v2.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    str q6, [x0]
 ; CHECK-NEXT:    cmtst v2.16b, v2.16b, v2.16b
-; CHECK-NEXT:    zip1 v3.8b, v2.8b, v0.8b
-; CHECK-NEXT:    zip2 v4.8b, v2.8b, v0.8b
-; CHECK-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-NEXT:    zip1 v5.8b, v2.8b, v0.8b
+; CHECK-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    zip1 v4.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    zip2 v2.8b, v2.8b, v0.8b
-; CHECK-NEXT:    shl v3.4s, v3.4s, #31
+; CHECK-NEXT:    zip1 v5.8b, v3.8b, v0.8b
+; CHECK-NEXT:    zip2 v3.8b, v3.8b, v0.8b
 ; CHECK-NEXT:    ushll v4.4s, v4.4h, #0
-; CHECK-NEXT:    cmlt v0.4s, v3.4s, #0
-; CHECK-NEXT:    ushll v3.4s, v5.4h, #0
 ; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-NEXT:    shl v1.4s, v4.4s, #31
+; CHECK-NEXT:    shl v4.4s, v4.4s, #31
+; CHECK-NEXT:    ushll v5.4s, v5.4h, #0
+; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-NEXT:    shl v2.4s, v2.4s, #31
+; CHECK-NEXT:    cmlt v0.4s, v4.4s, #0
+; CHECK-NEXT:    shl v5.4s, v5.4s, #31
 ; CHECK-NEXT:    shl v3.4s, v3.4s, #31
-; CHECK-NEXT:    shl v4.4s, v2.4s, #31
-; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
-; CHECK-NEXT:    cmlt v2.4s, v3.4s, #0
-; CHECK-NEXT:    cmlt v3.4s, v4.4s, #0
+; CHECK-NEXT:    cmlt v1.4s, v2.4s, #0
+; CHECK-NEXT:    cmlt v2.4s, v5.4s, #0
+; CHECK-NEXT:    cmlt v3.4s, v3.4s, #0
 ; CHECK-NEXT:    ret
   %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
   %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
@@ -201,7 +201,9 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    umull2 v2.4s, v0.8h, v1.8h
 ; CHECK-NEXT:    umull v3.4s, v0.4h, v1.4h
+; CHECK-NEXT:    mul v4.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    uzp2 v2.8h, v3.8h, v2.8h
+; CHECK-NEXT:    str q4, [x0]
 ; CHECK-NEXT:    cmtst v2.8h, v2.8h, v2.8h
 ; CHECK-NEXT:    xtn v2.8b, v2.8h
 ; CHECK-NEXT:    zip1 v3.8b, v2.8b, v0.8b
@@ -209,13 +211,9 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
 ; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-NEXT:    shl v3.4s, v3.4s, #31
-; CHECK-NEXT:    shl v4.4s, v2.4s, #31
-; CHECK-NEXT:    cmlt v2.4s, v3.4s, #0
-; CHECK-NEXT:    cmlt v3.4s, v4.4s, #0
-; CHECK-NEXT:    mul v4.8h, v0.8h, v1.8h
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    mov v1.16b, v3.16b
-; CHECK-NEXT:    str q4, [x0]
+; CHECK-NEXT:    shl v2.4s, v2.4s, #31
+; CHECK-NEXT:    cmlt v0.4s, v3.4s, #0
+; CHECK-NEXT:    cmlt v1.4s, v2.4s, #0
 ; CHECK-NEXT:    ret
   %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
   %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
@@ -229,23 +227,23 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: umulo_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x8, v1.d[1]
-; CHECK-NEXT:    fmov x10, d1
 ; CHECK-NEXT:    mov x9, v0.d[1]
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    umulh x12, x9, x8
-; CHECK-NEXT:    umulh x13, x11, x10
-; CHECK-NEXT:    cmp xzr, x12
-; CHECK-NEXT:    mul x10, x11, x10
-; CHECK-NEXT:    csetm x12, ne
+; CHECK-NEXT:    fmov x11, d1
+; CHECK-NEXT:    fmov x12, d0
+; CHECK-NEXT:    umulh x10, x9, x8
+; CHECK-NEXT:    umulh x13, x12, x11
+; CHECK-NEXT:    mul x11, x12, x11
+; CHECK-NEXT:    cmp xzr, x10
+; CHECK-NEXT:    csetm x10, ne
+; CHECK-NEXT:    mul x8, x9, x8
 ; CHECK-NEXT:    cmp xzr, x13
 ; CHECK-NEXT:    csetm x13, ne
-; CHECK-NEXT:    mul x8, x9, x8
-; CHECK-NEXT:    fmov d1, x10
 ; CHECK-NEXT:    fmov d0, x13
+; CHECK-NEXT:    fmov d1, x11
+; CHECK-NEXT:    mov v0.d[1], x10
 ; CHECK-NEXT:    mov v1.d[1], x8
-; CHECK-NEXT:    mov v0.d[1], x12
-; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
   %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
@@ -260,30 +258,30 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bic v1.4s, #255, lsl #24
 ; CHECK-NEXT:    bic v0.4s, #255, lsl #24
-; CHECK-NEXT:    mul v2.4s, v0.4s, v1.4s
-; CHECK-NEXT:    umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    mov w8, v2.s[3]
-; CHECK-NEXT:    mov w10, v2.s[2]
-; CHECK-NEXT:    mov w11, v2.s[1]
-; CHECK-NEXT:    ushr v1.4s, v2.4s, #24
-; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    fmov w9, s2
-; CHECK-NEXT:    cmtst v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT:    umull v3.2d, v0.2s, v1.2s
+; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uzp2 v1.4s, v3.4s, v2.4s
+; CHECK-NEXT:    ushr v2.4s, v0.4s, #24
+; CHECK-NEXT:    mov w8, v0.s[3]
+; CHECK-NEXT:    mov w9, v0.s[2]
+; CHECK-NEXT:    mov w10, v0.s[1]
+; CHECK-NEXT:    fmov w11, s0
+; CHECK-NEXT:    cmtst v2.4s, v2.4s, v2.4s
+; CHECK-NEXT:    cmeq v1.4s, v1.4s, #0
 ; CHECK-NEXT:    sturh w8, [x0, #9]
 ; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    strh w10, [x0, #6]
-; CHECK-NEXT:    lsr w10, w10, #16
-; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT:    sturh w11, [x0, #3]
-; CHECK-NEXT:    lsr w11, w11, #16
+; CHECK-NEXT:    strh w9, [x0, #6]
+; CHECK-NEXT:    lsr w9, w9, #16
 ; CHECK-NEXT:    strb w8, [x0, #11]
-; CHECK-NEXT:    lsr w8, w9, #16
-; CHECK-NEXT:    strh w9, [x0]
-; CHECK-NEXT:    orn v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    strb w10, [x0, #8]
-; CHECK-NEXT:    strb w11, [x0, #5]
-; CHECK-NEXT:    strb w8, [x0, #2]
+; CHECK-NEXT:    lsr w8, w10, #16
+; CHECK-NEXT:    orn v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    strb w9, [x0, #8]
+; CHECK-NEXT:    lsr w9, w11, #16
+; CHECK-NEXT:    sturh w10, [x0, #3]
+; CHECK-NEXT:    strh w11, [x0]
+; CHECK-NEXT:    strb w8, [x0, #5]
+; CHECK-NEXT:    strb w9, [x0, #2]
 ; CHECK-NEXT:    ret
   %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
   %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
@@ -296,10 +294,10 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: umulo_v4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI10_0
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    shl v0.4h, v0.4h, #15
+; CHECK-NEXT:    adrp x8, .LCPI10_0
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    shl v0.4h, v0.4h, #15
 ; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    addv h1, v0.4h
@@ -318,38 +316,38 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: umulo_v2i128:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mul x9, x7, x2
 ; CHECK-NEXT:    cmp x3, #0
-; CHECK-NEXT:    umulh x8, x3, x6
 ; CHECK-NEXT:    ccmp x7, #0, #4, ne
-; CHECK-NEXT:    umulh x9, x7, x2
-; CHECK-NEXT:    umulh x11, x5, x0
+; CHECK-NEXT:    umulh x10, x3, x6
+; CHECK-NEXT:    umulh x8, x7, x2
+; CHECK-NEXT:    madd x9, x3, x6, x9
+; CHECK-NEXT:    ccmp xzr, x10, #0, eq
+; CHECK-NEXT:    umulh x11, x2, x6
 ; CHECK-NEXT:    ccmp xzr, x8, #0, eq
-; CHECK-NEXT:    mul x8, x7, x2
-; CHECK-NEXT:    madd x8, x3, x6, x8
-; CHECK-NEXT:    ccmp xzr, x9, #0, eq
-; CHECK-NEXT:    umulh x9, x2, x6
-; CHECK-NEXT:    cset w10, ne
-; CHECK-NEXT:    adds x8, x9, x8
-; CHECK-NEXT:    csinc w9, w10, wzr, lo
+; CHECK-NEXT:    mul x13, x5, x0
+; CHECK-NEXT:    cset w8, ne
+; CHECK-NEXT:    umulh x14, x1, x4
+; CHECK-NEXT:    adds x9, x11, x9
+; CHECK-NEXT:    umulh x12, x5, x0
+; CHECK-NEXT:    csinc w8, w8, wzr, lo
 ; CHECK-NEXT:    cmp x1, #0
 ; CHECK-NEXT:    ccmp x5, #0, #4, ne
-; CHECK-NEXT:    umulh x10, x1, x4
-; CHECK-NEXT:    ccmp xzr, x10, #0, eq
-; CHECK-NEXT:    mul x10, x5, x0
-; CHECK-NEXT:    madd x10, x1, x4, x10
-; CHECK-NEXT:    ccmp xzr, x11, #0, eq
+; CHECK-NEXT:    madd x10, x1, x4, x13
+; CHECK-NEXT:    ccmp xzr, x14, #0, eq
 ; CHECK-NEXT:    umulh x11, x0, x4
+; CHECK-NEXT:    ccmp xzr, x12, #0, eq
 ; CHECK-NEXT:    cset w12, ne
 ; CHECK-NEXT:    adds x10, x11, x10
 ; CHECK-NEXT:    csinc w11, w12, wzr, lo
-; CHECK-NEXT:    mul x12, x0, x4
+; CHECK-NEXT:    ldr x12, [sp]
 ; CHECK-NEXT:    fmov s0, w11
-; CHECK-NEXT:    ldr x11, [sp]
-; CHECK-NEXT:    mov v0.s[1], w9
-; CHECK-NEXT:    mul x9, x2, x6
-; CHECK-NEXT:    stp x12, x10, [x11]
+; CHECK-NEXT:    mul x11, x0, x4
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    mul x8, x2, x6
+; CHECK-NEXT:    stp x11, x10, [x12]
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #31
-; CHECK-NEXT:    stp x9, x8, [x11, #16]
+; CHECK-NEXT:    stp x8, x9, [x12, #16]
 ; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
 ; CHECK-NEXT:    ret
   %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 9e113be3148880..039417784da0bb 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -276,10 +276,10 @@ define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
 ;
 ; CHECK-DOT-LABEL: add_v8i8_v8i32_zext:
 ; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.8b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b
+; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
 entry:
@@ -298,10 +298,10 @@ define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
 ;
 ; CHECK-DOT-LABEL: add_v8i8_v8i32_sext:
 ; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.8b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v2.2s, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b
+; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
 entry:
@@ -407,17 +407,17 @@ define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #0
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v3.4s, v1.4s
-; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v2.4s
-; CHECK-NEXT:    uaddl v1.2d, v3.2s, v1.2s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    add v2.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
 ; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    addp d0, v0.2d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -432,17 +432,17 @@ define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    sshll2 v2.4s, v1.8h, #0
 ; CHECK-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v3.4s, v1.4s
-; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v2.4s
-; CHECK-NEXT:    saddl v1.2d, v3.2s, v1.2s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    add v2.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
 ; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    addp d0, v0.2d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -876,10 +876,10 @@ define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
 ;
 ; CHECK-DOT-LABEL: add_v8i8_v8i32_acc_zext:
 ; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.8b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b
+; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
@@ -901,10 +901,10 @@ define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
 ;
 ; CHECK-DOT-LABEL: add_v8i8_v8i32_acc_sext:
 ; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.8b, #1
-; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v2.2s, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b
+; CHECK-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
 ; CHECK-DOT-NEXT:    fmov w8, s0
 ; CHECK-DOT-NEXT:    add w0, w8, w0
 ; CHECK-DOT-NEXT:    ret
@@ -1029,17 +1029,17 @@ define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ushll2 v1.8h, v0.16b, #0
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; CHECK-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v4.2d, v3.4s, v1.4s
-; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v2.4s
-; CHECK-NEXT:    uaddl v1.2d, v3.2s, v1.2s
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    add v2.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
+; CHECK-NEXT:    uaddl v2.2d, v3.2s, v2.2s
+; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
+; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
 ; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    addp d0, v0.2d
 ; CHECK-NEXT:    fmov x8, d0
 ; CHECK-NEXT:    add x0, x8, x0
@@ -1056,17 +1056,17 @@ define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-NEXT:    sshll2 v2.4s, v1.8h, #0
 ; CHECK-NEXT:    sshll2 v3.4s, v0.8h, #0
+; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v4.2d, v3.4s, v1.4s
-; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v2.4s
-; CHECK-NEXT:    saddl v1.2d, v3.2s, v1.2s
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    add v2.2d, v5.2d, v4.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
+; CHECK-NEXT:    saddl v2.2d, v3.2s, v2.2s
+; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
+; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    add v1.2d, v5.2d, v4.2d
 ; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    addp d0, v0.2d
 ; CHECK-NEXT:    fmov x8, d0
 ; CHECK-NEXT:    add x0, x8, x0
@@ -1577,11 +1577,11 @@ define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_zext:
 ; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-DOT-NEXT:    udot v3.2s, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    udot v3.2s, v0.8b, v2.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v3.2s, v3.2s
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v3.8b, #1
+; CHECK-DOT-NEXT:    udot v2.2s, v1.8b, v3.8b
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
 entry:
@@ -1606,11 +1606,11 @@ define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; CHECK-DOT-LABEL: add_pair_v8i8_v8i32_sext:
 ; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v3.2s, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    sdot v3.2s, v0.8b, v2.8b
-; CHECK-DOT-NEXT:    addp v0.2s, v3.2s, v3.2s
+; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v3.8b, #1
+; CHECK-DOT-NEXT:    sdot v2.2s, v1.8b, v3.8b
+; CHECK-DOT-NEXT:    sdot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
 ; CHECK-DOT-NEXT:    ret
 entry:
@@ -1746,29 +1746,29 @@ define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ushll2 v2.8h, v0.16b, #0
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v3.4s, v2.4h, #0
+; CHECK-NEXT:    ushll2 v3.8h, v1.16b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
 ; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    ushll2 v4.4s, v0.8h, #0
+; CHECK-NEXT:    ushll2 v5.4s, v0.8h, #0
 ; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    uaddl2 v5.2d, v4.4s, v2.4s
-; CHECK-NEXT:    uaddl2 v6.2d, v0.4s, v3.4s
-; CHECK-NEXT:    ushll2 v7.8h, v1.16b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    uaddl v2.2d, v4.2s, v2.2s
-; CHECK-NEXT:    add v4.2d, v6.2d, v5.2d
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v3.2s
-; CHECK-NEXT:    ushll v3.4s, v7.4h, #0
-; CHECK-NEXT:    ushll2 v5.4s, v7.8h, #0
-; CHECK-NEXT:    ushll2 v6.4s, v1.8h, #0
+; CHECK-NEXT:    ushll2 v6.4s, v3.8h, #0
+; CHECK-NEXT:    ushll2 v7.4s, v1.8h, #0
+; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    uaddl2 v7.2d, v6.4s, v5.4s
-; CHECK-NEXT:    uaddl v5.2d, v6.2s, v5.2s
-; CHECK-NEXT:    uaddl2 v6.2d, v1.4s, v3.4s
+; CHECK-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
+; CHECK-NEXT:    uaddl v2.2d, v5.2s, v2.2s
+; CHECK-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
+; CHECK-NEXT:    uaddl v0.2d, v0.2s, v4.2s
+; CHECK-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
+; CHECK-NEXT:    uaddl v6.2d, v7.2s, v6.2s
+; CHECK-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
 ; CHECK-NEXT:    uaddl v1.2d, v1.2s, v3.2s
+; CHECK-NEXT:    add v3.2d, v5.2d, v16.2d
 ; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v2.2d, v6.2d, v7.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v3.2d
 ; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
 ; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    addp d0, v0.2d
@@ -1788,29 +1788,29 @@ define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sshll2 v2.8h, v0.16b, #0
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    sshll v3.4s, v2.4h, #0
+; CHECK-NEXT:    sshll2 v3.8h, v1.16b, #0
+; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
 ; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    sshll2 v4.4s, v0.8h, #0
+; CHECK-NEXT:    sshll2 v5.4s, v0.8h, #0
 ; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    saddl2 v5.2d, v4.4s, v2.4s
-; CHECK-NEXT:    saddl2 v6.2d, v0.4s, v3.4s
-; CHECK-NEXT:    sshll2 v7.8h, v1.16b, #0
-; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NEXT:    saddl v2.2d, v4.2s, v2.2s
-; CHECK-NEXT:    add v4.2d, v6.2d, v5.2d
-; CHECK-NEXT:    saddl v0.2d, v0.2s, v3.2s
-; CHECK-NEXT:    sshll v3.4s, v7.4h, #0
-; CHECK-NEXT:    sshll2 v5.4s, v7.8h, #0
-; CHECK-NEXT:    sshll2 v6.4s, v1.8h, #0
+; CHECK-NEXT:    sshll2 v6.4s, v3.8h, #0
+; CHECK-NEXT:    sshll2 v7.4s, v1.8h, #0
+; CHECK-NEXT:    sshll v3.4s, v3.4h, #0
 ; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-NEXT:    saddl2 v7.2d, v6.4s, v5.4s
-; CHECK-NEXT:    saddl v5.2d, v6.2s, v5.2s
-; CHECK-NEXT:    saddl2 v6.2d, v1.4s, v3.4s
+; CHECK-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
+; CHECK-NEXT:    saddl v2.2d, v5.2s, v2.2s
+; CHECK-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
+; CHECK-NEXT:    saddl v0.2d, v0.2s, v4.2s
+; CHECK-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
+; CHECK-NEXT:    saddl v6.2d, v7.2s, v6.2s
+; CHECK-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
 ; CHECK-NEXT:    saddl v1.2d, v1.2s, v3.2s
+; CHECK-NEXT:    add v3.2d, v5.2d, v16.2d
 ; CHECK-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    add v2.2d, v6.2d, v7.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v5.2d
-; CHECK-NEXT:    add v0.2d, v0.2d, v4.2d
+; CHECK-NEXT:    add v2.2d, v7.2d, v4.2d
+; CHECK-NEXT:    add v1.2d, v1.2d, v6.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v3.2d
 ; CHECK-NEXT:    add v1.2d, v1.2d, v2.2d
 ; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    addp d0, v0.2d
@@ -1905,21 +1905,21 @@ entry:
 define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: add_pair_v4i8_v4i64_sext:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-NEXT:    ushll v3.2d, v1.2s, #0
 ; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    shl v3.2d, v3.2d, #56
+; CHECK-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-NEXT:    shl v2.2d, v2.2d, #56
+; CHECK-NEXT:    shl v3.2d, v3.2d, #56
 ; CHECK-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-NEXT:    sshr v3.2d, v3.2d, #56
 ; CHECK-NEXT:    shl v1.2d, v1.2d, #56
 ; CHECK-NEXT:    sshr v2.2d, v2.2d, #56
-; CHECK-NEXT:    ssra v3.2d, v0.2d, #56
-; CHECK-NEXT:    ssra v2.2d, v1.2d, #56
-; CHECK-NEXT:    add v0.2d, v3.2d, v2.2d
+; CHECK-NEXT:    sshr v3.2d, v3.2d, #56
+; CHECK-NEXT:    ssra v2.2d, v0.2d, #56
+; CHECK-NEXT:    ssra v3.2d, v1.2d, #56
+; CHECK-NEXT:    add v0.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    addp d0, v0.2d
 ; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
@@ -1975,12 +1975,12 @@ entry:
 define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
 ; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
 ; CHECK-BASE:       // %bb.0: // %entry
-; CHECK-BASE-NEXT:    sshll v3.8h, v3.8b, #0
 ; CHECK-BASE-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT:    saddlp v3.4s, v3.8h
-; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT:    sshll v3.8h, v3.8b, #0
 ; CHECK-BASE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BASE-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-BASE-NEXT:    uaddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT:    saddlp v3.4s, v3.8h
 ; CHECK-BASE-NEXT:    uadalp v1.4s, v0.8h
 ; CHECK-BASE-NEXT:    sadalp v3.4s, v2.8h
 ; CHECK-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
@@ -1993,10 +1993,10 @@ define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8
 ; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
 ; CHECK-DOT-NEXT:    movi v5.8b, #1
 ; CHECK-DOT-NEXT:    movi v6.2d, #0000000000000000
-; CHECK-DOT-NEXT:    sdot v4.2s, v3.8b, v5.8b
 ; CHECK-DOT-NEXT:    udot v6.2s, v1.8b, v5.8b
-; CHECK-DOT-NEXT:    sdot v4.2s, v2.8b, v5.8b
+; CHECK-DOT-NEXT:    sdot v4.2s, v3.8b, v5.8b
 ; CHECK-DOT-NEXT:    udot v6.2s, v0.8b, v5.8b
+; CHECK-DOT-NEXT:    sdot v4.2s, v2.8b, v5.8b
 ; CHECK-DOT-NEXT:    add v0.2s, v6.2s, v4.2s
 ; CHECK-DOT-NEXT:    addp v0.2s, v0.2s, v0.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0
@@ -2019,10 +2019,10 @@ entry:
 define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
 ; CHECK-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uaddlp v3.4s, v3.8h
 ; CHECK-NEXT:    uaddlp v1.4s, v1.8h
-; CHECK-NEXT:    uadalp v3.4s, v2.8h
+; CHECK-NEXT:    uaddlp v3.4s, v3.8h
 ; CHECK-NEXT:    uadalp v1.4s, v0.8h
+; CHECK-NEXT:    uadalp v3.4s, v2.8h
 ; CHECK-NEXT:    add v0.4s, v3.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -2068,53 +2068,53 @@ entry:
 define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-BASE-LABEL: full:
 ; CHECK-BASE:       // %bb.0: // %entry
+; CHECK-BASE-NEXT:    ldr d0, [x2]
+; CHECK-BASE-NEXT:    ldr d1, [x0]
 ; CHECK-BASE-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-BASE-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK-BASE-NEXT:    sxtw x8, w1
-; CHECK-BASE-NEXT:    sxtw x10, w3
-; CHECK-BASE-NEXT:    add x9, x0, x8
-; CHECK-BASE-NEXT:    ldr d0, [x0]
-; CHECK-BASE-NEXT:    ldr d1, [x2]
-; CHECK-BASE-NEXT:    add x11, x2, x10
-; CHECK-BASE-NEXT:    ldr d2, [x9]
-; CHECK-BASE-NEXT:    add x9, x9, x8
-; CHECK-BASE-NEXT:    uabdl v0.8h, v0.8b, v1.8b
-; CHECK-BASE-NEXT:    ldr d1, [x11]
-; CHECK-BASE-NEXT:    add x11, x11, x10
+; CHECK-BASE-NEXT:    sxtw x8, w3
+; CHECK-BASE-NEXT:    sxtw x9, w1
+; CHECK-BASE-NEXT:    uabdl v0.8h, v1.8b, v0.8b
+; CHECK-BASE-NEXT:    add x11, x2, x8
+; CHECK-BASE-NEXT:    add x10, x0, x9
+; CHECK-BASE-NEXT:    ldr d2, [x11]
+; CHECK-BASE-NEXT:    add x11, x11, x8
+; CHECK-BASE-NEXT:    ldr d1, [x10]
+; CHECK-BASE-NEXT:    add x10, x10, x9
 ; CHECK-BASE-NEXT:    uaddlp v0.4s, v0.8h
-; CHECK-BASE-NEXT:    uabdl v1.8h, v2.8b, v1.8b
-; CHECK-BASE-NEXT:    ldr d2, [x9]
-; CHECK-BASE-NEXT:    ldr d3, [x11]
-; CHECK-BASE-NEXT:    add x9, x9, x8
-; CHECK-BASE-NEXT:    add x11, x11, x10
+; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    ldr d2, [x11]
+; CHECK-BASE-NEXT:    add x11, x11, x8
+; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
+; CHECK-BASE-NEXT:    ldr d1, [x10]
+; CHECK-BASE-NEXT:    add x10, x10, x9
+; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    ldr d2, [x11]
+; CHECK-BASE-NEXT:    add x11, x11, x8
 ; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    uabdl v1.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT:    ldr d2, [x9]
-; CHECK-BASE-NEXT:    ldr d3, [x11]
-; CHECK-BASE-NEXT:    add x9, x9, x8
-; CHECK-BASE-NEXT:    add x11, x11, x10
+; CHECK-BASE-NEXT:    ldr d1, [x10]
+; CHECK-BASE-NEXT:    add x10, x10, x9
+; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    ldr d2, [x11]
+; CHECK-BASE-NEXT:    add x11, x11, x8
 ; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    uabdl v1.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT:    ldr d2, [x9]
-; CHECK-BASE-NEXT:    ldr d3, [x11]
-; CHECK-BASE-NEXT:    add x9, x9, x8
-; CHECK-BASE-NEXT:    add x11, x11, x10
+; CHECK-BASE-NEXT:    ldr d1, [x10]
+; CHECK-BASE-NEXT:    add x10, x10, x9
+; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    ldr d2, [x11]
+; CHECK-BASE-NEXT:    add x11, x11, x8
 ; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    uabdl v1.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT:    ldr d2, [x9]
-; CHECK-BASE-NEXT:    ldr d3, [x11]
-; CHECK-BASE-NEXT:    add x9, x9, x8
-; CHECK-BASE-NEXT:    add x11, x11, x10
+; CHECK-BASE-NEXT:    ldr d1, [x10]
+; CHECK-BASE-NEXT:    add x10, x10, x9
+; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    ldr d2, [x11]
 ; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    uabdl v1.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT:    ldr d2, [x9]
-; CHECK-BASE-NEXT:    ldr d3, [x11]
+; CHECK-BASE-NEXT:    ldr d1, [x10]
+; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
+; CHECK-BASE-NEXT:    ldr d2, [x11, x8]
 ; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT:    ldr d1, [x9, x8]
-; CHECK-BASE-NEXT:    uabdl v2.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT:    ldr d3, [x11, x10]
-; CHECK-BASE-NEXT:    uadalp v0.4s, v2.8h
-; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v3.8b
+; CHECK-BASE-NEXT:    ldr d1, [x10, x9]
+; CHECK-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
 ; CHECK-BASE-NEXT:    uadalp v0.4s, v1.8h
 ; CHECK-BASE-NEXT:    addv s0, v0.4s
 ; CHECK-BASE-NEXT:    fmov w0, s0
@@ -2122,21 +2122,21 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ;
 ; CHECK-DOT-LABEL: full:
 ; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ldr d0, [x0]
+; CHECK-DOT-NEXT:    ldr d1, [x2]
 ; CHECK-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
 ; CHECK-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-DOT-NEXT:    sxtw x8, w3
 ; CHECK-DOT-NEXT:    sxtw x9, w1
-; CHECK-DOT-NEXT:    ldr d0, [x0]
-; CHECK-DOT-NEXT:    add x10, x0, x9
-; CHECK-DOT-NEXT:    ldr d1, [x2]
-; CHECK-DOT-NEXT:    add x11, x2, x8
 ; CHECK-DOT-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-DOT-NEXT:    movi v3.8b, #1
 ; CHECK-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
-; CHECK-DOT-NEXT:    ldr d1, [x10]
+; CHECK-DOT-NEXT:    add x11, x2, x8
+; CHECK-DOT-NEXT:    add x10, x0, x9
 ; CHECK-DOT-NEXT:    ldr d4, [x11]
-; CHECK-DOT-NEXT:    add x10, x10, x9
 ; CHECK-DOT-NEXT:    add x11, x11, x8
+; CHECK-DOT-NEXT:    ldr d1, [x10]
+; CHECK-DOT-NEXT:    add x10, x10, x9
 ; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
 ; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
 ; CHECK-DOT-NEXT:    ldr d1, [x10]
@@ -2166,11 +2166,11 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
 ; CHECK-DOT-NEXT:    ldr d1, [x10]
 ; CHECK-DOT-NEXT:    ldr d4, [x11]
 ; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
-; CHECK-DOT-NEXT:    ldr d0, [x10, x9]
-; CHECK-DOT-NEXT:    uabd v1.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
+; CHECK-DOT-NEXT:    ldr d1, [x10, x9]
 ; CHECK-DOT-NEXT:    ldr d4, [x11, x8]
-; CHECK-DOT-NEXT:    udot v2.2s, v1.8b, v3.8b
-; CHECK-DOT-NEXT:    uabd v0.8b, v0.8b, v4.8b
+; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
+; CHECK-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
 ; CHECK-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
 ; CHECK-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
 ; CHECK-DOT-NEXT:    fmov w0, s0

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
index 1b070635567957..7fa416e0dbcd5c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
@@ -101,8 +101,8 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind {
 define i8 @test_v9i8(<9 x i8> %a) nounwind {
 ; CHECK-LABEL: test_v9i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    mov v1.16b, v0.16b
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-NEXT:    mov v1.b[9], w8
 ; CHECK-NEXT:    mov v1.b[10], w8
 ; CHECK-NEXT:    mov v1.b[11], w8
@@ -165,8 +165,8 @@ define i24 @test_v4i24(<4 x i24> %a) nounwind {
 define i128 @test_v2i128(<2 x i128> %a) nounwind {
 ; CHECK-LABEL: test_v2i128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x0, x0, x2
 ; CHECK-NEXT:    and x1, x1, x3
+; CHECK-NEXT:    and x0, x0, x2
 ; CHECK-NEXT:    ret
   %b = call i128 @llvm.vector.reduce.and.v2i128(<2 x i128> %a)
   ret i128 %b

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
index af63e3e81f39cb..215b6e086591dc 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
@@ -144,8 +144,8 @@ define fp128 @test_v2f128(<2 x fp128> %a, fp128 %s) nounwind {
 ; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov v1.16b, v0.16b
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
@@ -170,7 +170,7 @@ define float @test_v16f32(<16 x float> %a, float %s) nounwind {
 ; CHECK-NEXT:    fadd s4, s4, s0
 ; CHECK-NEXT:    mov s7, v0.s[2]
 ; CHECK-NEXT:    mov s0, v0.s[3]
-; CHECK-NEXT:    mov s5, v3.s[1]
+; CHECK-NEXT:    mov s5, v2.s[1]
 ; CHECK-NEXT:    fadd s4, s4, s6
 ; CHECK-NEXT:    mov s6, v1.s[2]
 ; CHECK-NEXT:    fadd s4, s4, s7
@@ -179,21 +179,21 @@ define float @test_v16f32(<16 x float> %a, float %s) nounwind {
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    mov s1, v1.s[3]
 ; CHECK-NEXT:    fadd s0, s0, s4
-; CHECK-NEXT:    mov s4, v2.s[2]
 ; CHECK-NEXT:    fadd s0, s0, s6
 ; CHECK-NEXT:    fadd s0, s0, s1
-; CHECK-NEXT:    mov s1, v2.s[1]
+; CHECK-NEXT:    mov s1, v2.s[2]
 ; CHECK-NEXT:    fadd s0, s0, s2
+; CHECK-NEXT:    mov s2, v2.s[3]
+; CHECK-NEXT:    fadd s0, s0, s5
 ; CHECK-NEXT:    fadd s0, s0, s1
-; CHECK-NEXT:    mov s1, v2.s[3]
-; CHECK-NEXT:    mov s2, v3.s[3]
-; CHECK-NEXT:    fadd s0, s0, s4
-; CHECK-NEXT:    fadd s0, s0, s1
-; CHECK-NEXT:    mov s1, v3.s[2]
+; CHECK-NEXT:    mov s1, v3.s[1]
+; CHECK-NEXT:    fadd s0, s0, s2
+; CHECK-NEXT:    mov s2, v3.s[2]
 ; CHECK-NEXT:    fadd s0, s0, s3
-; CHECK-NEXT:    fadd s0, s0, s5
 ; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    mov s1, v3.s[3]
 ; CHECK-NEXT:    fadd s0, s0, s2
+; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
   %b = call float @llvm.vector.reduce.fadd.f32.v16f32(float %s, <16 x float> %a)
   ret float %b
@@ -205,24 +205,24 @@ define float @test_v16f32_neutral(<16 x float> %a) nounwind {
 ; CHECK-NEXT:    mov s5, v0.s[2]
 ; CHECK-NEXT:    faddp s6, v0.2s
 ; CHECK-NEXT:    mov s0, v0.s[3]
-; CHECK-NEXT:    mov s4, v2.s[1]
+; CHECK-NEXT:    mov s4, v1.s[1]
 ; CHECK-NEXT:    fadd s5, s6, s5
-; CHECK-NEXT:    mov s6, v1.s[2]
 ; CHECK-NEXT:    fadd s0, s5, s0
-; CHECK-NEXT:    mov s5, v1.s[1]
+; CHECK-NEXT:    mov s5, v1.s[2]
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    mov s1, v1.s[3]
+; CHECK-NEXT:    fadd s0, s0, s4
+; CHECK-NEXT:    mov s4, v2.s[2]
 ; CHECK-NEXT:    fadd s0, s0, s5
-; CHECK-NEXT:    fadd s0, s0, s6
 ; CHECK-NEXT:    fadd s0, s0, s1
-; CHECK-NEXT:    mov s1, v2.s[2]
+; CHECK-NEXT:    mov s1, v2.s[1]
 ; CHECK-NEXT:    fadd s0, s0, s2
-; CHECK-NEXT:    mov s2, v2.s[3]
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    mov s1, v2.s[3]
+; CHECK-NEXT:    mov s2, v3.s[2]
 ; CHECK-NEXT:    fadd s0, s0, s4
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    mov s1, v3.s[1]
-; CHECK-NEXT:    fadd s0, s0, s2
-; CHECK-NEXT:    mov s2, v3.s[2]
 ; CHECK-NEXT:    fadd s0, s0, s3
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    mov s1, v3.s[3]

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index b43b01027dda21..e770def93aa4e6 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -172,8 +172,8 @@ define half @add_2H(<16 x half> %bin.rdx)  {
 ; CHECKNOFP16-NEXT:    mov h5, v0.h[6]
 ; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
 ; CHECKNOFP16-NEXT:    fadd s2, s2, s3
-; CHECKNOFP16-NEXT:    mov h3, v1.h[6]
-; CHECKNOFP16-NEXT:    fcvt h4, s4
+; CHECKNOFP16-NEXT:    fcvt h3, s4
+; CHECKNOFP16-NEXT:    mov h4, v1.h[6]
 ; CHECKNOFP16-NEXT:    fcvt s5, h5
 ; CHECKNOFP16-NEXT:    mov h1, v1.h[7]
 ; CHECKNOFP16-NEXT:    fcvt s0, h0
@@ -182,15 +182,15 @@ define half @add_2H(<16 x half> %bin.rdx)  {
 ; CHECKNOFP16-NEXT:    fcvt s4, h4
 ; CHECKNOFP16-NEXT:    fcvt s1, h1
 ; CHECKNOFP16-NEXT:    fcvt s2, h2
-; CHECKNOFP16-NEXT:    fadd s3, s5, s3
 ; CHECKNOFP16-NEXT:    fadd s0, s0, s1
-; CHECKNOFP16-NEXT:    fadd s2, s2, s4
-; CHECKNOFP16-NEXT:    fcvt h3, s3
+; CHECKNOFP16-NEXT:    fadd s2, s2, s3
+; CHECKNOFP16-NEXT:    fadd s3, s5, s4
 ; CHECKNOFP16-NEXT:    fcvt h0, s0
 ; CHECKNOFP16-NEXT:    fcvt h2, s2
-; CHECKNOFP16-NEXT:    fcvt s3, h3
+; CHECKNOFP16-NEXT:    fcvt h3, s3
 ; CHECKNOFP16-NEXT:    fcvt s0, h0
 ; CHECKNOFP16-NEXT:    fcvt s2, h2
+; CHECKNOFP16-NEXT:    fcvt s3, h3
 ; CHECKNOFP16-NEXT:    fadd s2, s2, s3
 ; CHECKNOFP16-NEXT:    fcvt h1, s2
 ; CHECKNOFP16-NEXT:    fcvt s1, h1
@@ -494,8 +494,8 @@ define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECKNOFP16-NEXT:    fadd s3, s3, s5
 ; CHECKNOFP16-NEXT:    mov h4, v0.h[6]
 ; CHECKNOFP16-NEXT:    mov h5, v1.h[6]
-; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
 ; CHECKNOFP16-NEXT:    mov h1, v1.h[7]
+; CHECKNOFP16-NEXT:    mov h0, v0.h[7]
 ; CHECKNOFP16-NEXT:    fcvt h2, s2
 ; CHECKNOFP16-NEXT:    fcvt h3, s3
 ; CHECKNOFP16-NEXT:    fcvt s4, h4

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
index 2b1927ff644691..4354fcd465dac8 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
@@ -269,8 +269,8 @@ define half @test_v16f16(<16 x half> %a) nounwind {
 ; CHECK-NOFP-SD-NEXT:    mov h5, v0.h[6]
 ; CHECK-NOFP-SD-NEXT:    mov h0, v0.h[7]
 ; CHECK-NOFP-SD-NEXT:    fmaxnm s2, s2, s3
-; CHECK-NOFP-SD-NEXT:    mov h3, v1.h[6]
-; CHECK-NOFP-SD-NEXT:    fcvt h4, s4
+; CHECK-NOFP-SD-NEXT:    fcvt h3, s4
+; CHECK-NOFP-SD-NEXT:    mov h4, v1.h[6]
 ; CHECK-NOFP-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP-SD-NEXT:    mov h1, v1.h[7]
 ; CHECK-NOFP-SD-NEXT:    fcvt s0, h0
@@ -279,15 +279,15 @@ define half @test_v16f16(<16 x half> %a) nounwind {
 ; CHECK-NOFP-SD-NEXT:    fcvt s4, h4
 ; CHECK-NOFP-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-SD-NEXT:    fcvt s2, h2
-; CHECK-NOFP-SD-NEXT:    fmaxnm s3, s5, s3
 ; CHECK-NOFP-SD-NEXT:    fmaxnm s0, s0, s1
-; CHECK-NOFP-SD-NEXT:    fmaxnm s2, s2, s4
-; CHECK-NOFP-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP-SD-NEXT:    fmaxnm s2, s2, s3
+; CHECK-NOFP-SD-NEXT:    fmaxnm s3, s5, s4
 ; CHECK-NOFP-SD-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-SD-NEXT:    fcvt h2, s2
-; CHECK-NOFP-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-SD-NEXT:    fcvt s2, h2
+; CHECK-NOFP-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-SD-NEXT:    fmaxnm s2, s2, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt h1, s2
 ; CHECK-NOFP-SD-NEXT:    fcvt s1, h1
@@ -404,15 +404,15 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-NOFP-LABEL: test_v11f16:
 ; CHECK-NOFP:       // %bb.0:
 ; CHECK-NOFP-NEXT:    ldr h16, [sp, #8]
-; CHECK-NOFP-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-NEXT:    ldr h17, [sp]
+; CHECK-NOFP-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcvt s17, h17
 ; CHECK-NOFP-NEXT:    fmaxnm s1, s1, s16
-; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s17
+; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
@@ -455,8 +455,8 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT:    // kill: def $h4 killed $h4 def $q4
-; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h5 killed $h5 def $q5
+; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT:    // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT:    mov v0.h[1], v1.h[0]
@@ -464,11 +464,11 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    mov v0.h[2], v2.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[0], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #8
-; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #16
-; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
+; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[2], [x8]
+; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[7], v7.h[0]
@@ -483,15 +483,15 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind {
 ; CHECK-NOFP-LABEL: test_v11f16_ninf:
 ; CHECK-NOFP:       // %bb.0:
 ; CHECK-NOFP-NEXT:    ldr h16, [sp, #8]
-; CHECK-NOFP-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-NEXT:    ldr h17, [sp]
+; CHECK-NOFP-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcvt s17, h17
 ; CHECK-NOFP-NEXT:    fmaxnm s1, s1, s16
-; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s17
+; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
@@ -534,8 +534,8 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT:    // kill: def $h4 killed $h4 def $q4
-; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h5 killed $h5 def $q5
+; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT:    // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT:    mov v0.h[1], v1.h[0]
@@ -543,11 +543,11 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    mov v0.h[2], v2.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[0], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #8
-; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #16
-; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
+; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[2], [x8]
+; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[7], v7.h[0]

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
index 067c766875c8ce..4f1e3fdc34fcd3 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -289,14 +289,14 @@ define half @test_v16f16(<16 x half> %a) nounwind {
 ; CHECK-NOFP-SD-NEXT:    fmaxnm s2, s2, s3
 ; CHECK-NOFP-SD-NEXT:    fcsel s3, s5, s4, gt
 ; CHECK-NOFP-SD-NEXT:    fcmp s0, s1
+; CHECK-NOFP-SD-NEXT:    fcsel s0, s0, s1, gt
 ; CHECK-NOFP-SD-NEXT:    fcvt h2, s2
 ; CHECK-NOFP-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP-SD-NEXT:    fcsel s0, s0, s1, gt
+; CHECK-NOFP-SD-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-SD-NEXT:    fcvt s3, h3
-; CHECK-NOFP-SD-NEXT:    fcvt h0, s0
-; CHECK-NOFP-SD-NEXT:    fmaxnm s2, s2, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt s0, h0
+; CHECK-NOFP-SD-NEXT:    fmaxnm s2, s2, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt h1, s2
 ; CHECK-NOFP-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-SD-NEXT:    fmaxnm s0, s1, s0
@@ -418,30 +418,30 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
-; CHECK-NOFP-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-NEXT:    fcvt s17, h17
+; CHECK-NOFP-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-NEXT:    fcmp s1, s16
 ; CHECK-NOFP-NEXT:    fcsel s1, s1, s16, gt
 ; CHECK-NOFP-NEXT:    fcmp s0, s17
 ; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
+; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcsel s0, s0, s17, gt
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
-; CHECK-NOFP-NEXT:    fcvt s16, h16
+; CHECK-NOFP-NEXT:    fcmp s2, s16
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-NEXT:    fcvt s1, h1
-; CHECK-NOFP-NEXT:    fcmp s2, s16
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-NOFP-NEXT:    fcsel s1, s2, s16, gt
 ; CHECK-NOFP-NEXT:    ldr h2, [x8, :lo12:.LCPI14_0]
 ; CHECK-NOFP-NEXT:    mov w8, #-8388608 // =0xff800000
-; CHECK-NOFP-NEXT:    fcvt h0, s0
-; CHECK-NOFP-NEXT:    fcvt h1, s1
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    fmov s16, w8
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt h1, s1
+; CHECK-NOFP-NEXT:    fcmp s3, s2
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fcvt s1, h1
-; CHECK-NOFP-NEXT:    fcmp s3, s2
 ; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-NOFP-NEXT:    fcsel s1, s3, s16, gt
 ; CHECK-NOFP-NEXT:    fcvt s3, h4
@@ -491,8 +491,8 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT:    // kill: def $h4 killed $h4 def $q4
-; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h5 killed $h5 def $q5
+; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT:    // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT:    mov v0.h[1], v1.h[0]
@@ -500,11 +500,11 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    mov v0.h[2], v2.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[0], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #8
-; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #16
-; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
+; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[2], [x8]
+; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[7], v7.h[0]
@@ -525,31 +525,31 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind {
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
-; CHECK-NOFP-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-NEXT:    fcvt s17, h17
+; CHECK-NOFP-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-NEXT:    fcmp s1, s16
 ; CHECK-NOFP-NEXT:    fcsel s1, s1, s16, gt
 ; CHECK-NOFP-NEXT:    fcmp s0, s17
 ; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
+; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcsel s0, s0, s17, gt
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
-; CHECK-NOFP-NEXT:    fcvt s16, h16
+; CHECK-NOFP-NEXT:    fcmp s2, s16
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-NEXT:    fcvt s1, h1
-; CHECK-NOFP-NEXT:    fcmp s2, s16
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-NOFP-NEXT:    fcsel s1, s2, s16, gt
 ; CHECK-NOFP-NEXT:    ldr h2, [x8, :lo12:.LCPI15_0]
 ; CHECK-NOFP-NEXT:    mov w8, #57344 // =0xe000
+; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    movk w8, #51071, lsl #16
+; CHECK-NOFP-NEXT:    fmov s16, w8
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
-; CHECK-NOFP-NEXT:    fcvt s2, h2
-; CHECK-NOFP-NEXT:    fmov s16, w8
+; CHECK-NOFP-NEXT:    fcmp s3, s2
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fcvt s1, h1
-; CHECK-NOFP-NEXT:    fcmp s3, s2
 ; CHECK-NOFP-NEXT:    fmaxnm s0, s0, s1
 ; CHECK-NOFP-NEXT:    fcsel s1, s3, s16, gt
 ; CHECK-NOFP-NEXT:    fcvt s3, h4
@@ -599,8 +599,8 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT:    // kill: def $h4 killed $h4 def $q4
-; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h5 killed $h5 def $q5
+; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT:    // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT:    mov v0.h[1], v1.h[0]

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
index 49270c427407b6..635ed3e1977ce1 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
@@ -229,8 +229,8 @@ define half @test_v16f16(<16 x half> %a) nounwind {
 ; CHECK-NOFP-SD-NEXT:    mov h5, v0.h[6]
 ; CHECK-NOFP-SD-NEXT:    mov h0, v0.h[7]
 ; CHECK-NOFP-SD-NEXT:    fmax s2, s2, s3
-; CHECK-NOFP-SD-NEXT:    mov h3, v1.h[6]
-; CHECK-NOFP-SD-NEXT:    fcvt h4, s4
+; CHECK-NOFP-SD-NEXT:    fcvt h3, s4
+; CHECK-NOFP-SD-NEXT:    mov h4, v1.h[6]
 ; CHECK-NOFP-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP-SD-NEXT:    mov h1, v1.h[7]
 ; CHECK-NOFP-SD-NEXT:    fcvt s0, h0
@@ -239,15 +239,15 @@ define half @test_v16f16(<16 x half> %a) nounwind {
 ; CHECK-NOFP-SD-NEXT:    fcvt s4, h4
 ; CHECK-NOFP-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-SD-NEXT:    fcvt s2, h2
-; CHECK-NOFP-SD-NEXT:    fmax s3, s5, s3
 ; CHECK-NOFP-SD-NEXT:    fmax s0, s0, s1
-; CHECK-NOFP-SD-NEXT:    fmax s2, s2, s4
-; CHECK-NOFP-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP-SD-NEXT:    fmax s2, s2, s3
+; CHECK-NOFP-SD-NEXT:    fmax s3, s5, s4
 ; CHECK-NOFP-SD-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-SD-NEXT:    fcvt h2, s2
-; CHECK-NOFP-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-SD-NEXT:    fcvt s2, h2
+; CHECK-NOFP-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-SD-NEXT:    fmax s2, s2, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt h1, s2
 ; CHECK-NOFP-SD-NEXT:    fcvt s1, h1
@@ -364,15 +364,15 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-NOFP-LABEL: test_v11f16:
 ; CHECK-NOFP:       // %bb.0:
 ; CHECK-NOFP-NEXT:    ldr h16, [sp, #8]
-; CHECK-NOFP-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-NEXT:    ldr h17, [sp]
+; CHECK-NOFP-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcvt s17, h17
 ; CHECK-NOFP-NEXT:    fmax s1, s1, s16
-; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT:    fmax s0, s0, s17
+; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
@@ -415,8 +415,8 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT:    // kill: def $h4 killed $h4 def $q4
-; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h5 killed $h5 def $q5
+; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT:    // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT:    mov v0.h[1], v1.h[0]
@@ -424,11 +424,11 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    mov v0.h[2], v2.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[0], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #8
-; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #16
-; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
+; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[2], [x8]
+; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[7], v7.h[0]

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index 3d73569ef1f44f..a2bfc3c438da36 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -289,14 +289,14 @@ define half @test_v16f16(<16 x half> %a) nounwind {
 ; CHECK-NOFP-SD-NEXT:    fminnm s2, s2, s3
 ; CHECK-NOFP-SD-NEXT:    fcsel s3, s5, s4, lt
 ; CHECK-NOFP-SD-NEXT:    fcmp s0, s1
+; CHECK-NOFP-SD-NEXT:    fcsel s0, s0, s1, lt
 ; CHECK-NOFP-SD-NEXT:    fcvt h2, s2
 ; CHECK-NOFP-SD-NEXT:    fcvt h3, s3
-; CHECK-NOFP-SD-NEXT:    fcsel s0, s0, s1, lt
+; CHECK-NOFP-SD-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-SD-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-SD-NEXT:    fcvt s3, h3
-; CHECK-NOFP-SD-NEXT:    fcvt h0, s0
-; CHECK-NOFP-SD-NEXT:    fminnm s2, s2, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt s0, h0
+; CHECK-NOFP-SD-NEXT:    fminnm s2, s2, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt h1, s2
 ; CHECK-NOFP-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-SD-NEXT:    fminnm s0, s1, s0
@@ -418,30 +418,30 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    adrp x8, .LCPI14_0
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
-; CHECK-NOFP-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-NEXT:    fcvt s17, h17
+; CHECK-NOFP-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-NEXT:    fcmp s1, s16
 ; CHECK-NOFP-NEXT:    fcsel s1, s1, s16, lt
 ; CHECK-NOFP-NEXT:    fcmp s0, s17
 ; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
+; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcsel s0, s0, s17, lt
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
-; CHECK-NOFP-NEXT:    fcvt s16, h16
+; CHECK-NOFP-NEXT:    fcmp s2, s16
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-NEXT:    fcvt s1, h1
-; CHECK-NOFP-NEXT:    fcmp s2, s16
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fminnm s0, s0, s1
 ; CHECK-NOFP-NEXT:    fcsel s1, s2, s16, lt
 ; CHECK-NOFP-NEXT:    ldr h2, [x8, :lo12:.LCPI14_0]
 ; CHECK-NOFP-NEXT:    mov w8, #2139095040 // =0x7f800000
-; CHECK-NOFP-NEXT:    fcvt h0, s0
-; CHECK-NOFP-NEXT:    fcvt h1, s1
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    fmov s16, w8
+; CHECK-NOFP-NEXT:    fcvt h0, s0
+; CHECK-NOFP-NEXT:    fcvt h1, s1
+; CHECK-NOFP-NEXT:    fcmp s3, s2
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fcvt s1, h1
-; CHECK-NOFP-NEXT:    fcmp s3, s2
 ; CHECK-NOFP-NEXT:    fminnm s0, s0, s1
 ; CHECK-NOFP-NEXT:    fcsel s1, s3, s16, lt
 ; CHECK-NOFP-NEXT:    fcvt s3, h4
@@ -491,8 +491,8 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT:    // kill: def $h4 killed $h4 def $q4
-; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h5 killed $h5 def $q5
+; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT:    // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT:    mov v0.h[1], v1.h[0]
@@ -500,11 +500,11 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    mov v0.h[2], v2.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[0], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #8
-; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #16
-; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
+; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[2], [x8]
+; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[7], v7.h[0]
@@ -525,31 +525,31 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind {
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
-; CHECK-NOFP-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-NEXT:    fcvt s17, h17
+; CHECK-NOFP-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-NEXT:    fcmp s1, s16
 ; CHECK-NOFP-NEXT:    fcsel s1, s1, s16, lt
 ; CHECK-NOFP-NEXT:    fcmp s0, s17
 ; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
+; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcsel s0, s0, s17, lt
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
-; CHECK-NOFP-NEXT:    fcvt s16, h16
+; CHECK-NOFP-NEXT:    fcmp s2, s16
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-NEXT:    fcvt s1, h1
-; CHECK-NOFP-NEXT:    fcmp s2, s16
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fminnm s0, s0, s1
 ; CHECK-NOFP-NEXT:    fcsel s1, s2, s16, lt
 ; CHECK-NOFP-NEXT:    ldr h2, [x8, :lo12:.LCPI15_0]
 ; CHECK-NOFP-NEXT:    mov w8, #57344 // =0xe000
+; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    movk w8, #18303, lsl #16
+; CHECK-NOFP-NEXT:    fmov s16, w8
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
-; CHECK-NOFP-NEXT:    fcvt s2, h2
-; CHECK-NOFP-NEXT:    fmov s16, w8
+; CHECK-NOFP-NEXT:    fcmp s3, s2
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fcvt s1, h1
-; CHECK-NOFP-NEXT:    fcmp s3, s2
 ; CHECK-NOFP-NEXT:    fminnm s0, s0, s1
 ; CHECK-NOFP-NEXT:    fcsel s1, s3, s16, lt
 ; CHECK-NOFP-NEXT:    fcvt s3, h4
@@ -599,8 +599,8 @@ define half @test_v11f16_ninf(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT:    // kill: def $h4 killed $h4 def $q4
-; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h5 killed $h5 def $q5
+; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT:    // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT:    mov v0.h[1], v1.h[0]

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
index 378e50795c1dd3..8123ca6d1b54ff 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
@@ -229,8 +229,8 @@ define half @test_v16f16(<16 x half> %a) nounwind {
 ; CHECK-NOFP-SD-NEXT:    mov h5, v0.h[6]
 ; CHECK-NOFP-SD-NEXT:    mov h0, v0.h[7]
 ; CHECK-NOFP-SD-NEXT:    fmin s2, s2, s3
-; CHECK-NOFP-SD-NEXT:    mov h3, v1.h[6]
-; CHECK-NOFP-SD-NEXT:    fcvt h4, s4
+; CHECK-NOFP-SD-NEXT:    fcvt h3, s4
+; CHECK-NOFP-SD-NEXT:    mov h4, v1.h[6]
 ; CHECK-NOFP-SD-NEXT:    fcvt s5, h5
 ; CHECK-NOFP-SD-NEXT:    mov h1, v1.h[7]
 ; CHECK-NOFP-SD-NEXT:    fcvt s0, h0
@@ -239,15 +239,15 @@ define half @test_v16f16(<16 x half> %a) nounwind {
 ; CHECK-NOFP-SD-NEXT:    fcvt s4, h4
 ; CHECK-NOFP-SD-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-SD-NEXT:    fcvt s2, h2
-; CHECK-NOFP-SD-NEXT:    fmin s3, s5, s3
 ; CHECK-NOFP-SD-NEXT:    fmin s0, s0, s1
-; CHECK-NOFP-SD-NEXT:    fmin s2, s2, s4
-; CHECK-NOFP-SD-NEXT:    fcvt h3, s3
+; CHECK-NOFP-SD-NEXT:    fmin s2, s2, s3
+; CHECK-NOFP-SD-NEXT:    fmin s3, s5, s4
 ; CHECK-NOFP-SD-NEXT:    fcvt h0, s0
 ; CHECK-NOFP-SD-NEXT:    fcvt h2, s2
-; CHECK-NOFP-SD-NEXT:    fcvt s3, h3
+; CHECK-NOFP-SD-NEXT:    fcvt h3, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-SD-NEXT:    fcvt s2, h2
+; CHECK-NOFP-SD-NEXT:    fcvt s3, h3
 ; CHECK-NOFP-SD-NEXT:    fmin s2, s2, s3
 ; CHECK-NOFP-SD-NEXT:    fcvt h1, s2
 ; CHECK-NOFP-SD-NEXT:    fcvt s1, h1
@@ -364,15 +364,15 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-NOFP-LABEL: test_v11f16:
 ; CHECK-NOFP:       // %bb.0:
 ; CHECK-NOFP-NEXT:    ldr h16, [sp, #8]
-; CHECK-NOFP-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-NEXT:    ldr h17, [sp]
+; CHECK-NOFP-NEXT:    fcvt s1, h1
 ; CHECK-NOFP-NEXT:    fcvt s0, h0
 ; CHECK-NOFP-NEXT:    fcvt s2, h2
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcvt s17, h17
 ; CHECK-NOFP-NEXT:    fmin s1, s1, s16
-; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT:    fmin s0, s0, s17
+; CHECK-NOFP-NEXT:    ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT:    fcvt s16, h16
 ; CHECK-NOFP-NEXT:    fcvt h1, s1
 ; CHECK-NOFP-NEXT:    fcvt h0, s0
@@ -415,8 +415,8 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT:    // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT:    // kill: def $h4 killed $h4 def $q4
-; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h5 killed $h5 def $q5
+; CHECK-FP-NEXT:    mov x8, sp
 ; CHECK-FP-NEXT:    // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT:    // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT:    mov v0.h[1], v1.h[0]
@@ -424,11 +424,11 @@ define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-FP-NEXT:    mov v0.h[2], v2.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[0], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #8
-; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT:    add x8, sp, #16
-; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
+; CHECK-FP-NEXT:    mov v0.h[3], v3.h[0]
 ; CHECK-FP-NEXT:    ld1 { v1.h }[2], [x8]
+; CHECK-FP-NEXT:    mov v0.h[4], v4.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT:    mov v0.h[7], v7.h[0]

diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
index 90cbbc05b4d11a..c7134508883b11 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -40,8 +40,8 @@ define <1 x double> @test_copysign_v1f64_v1f32(<1 x double> %a, <1 x float> %b)
 ; CHECK-LABEL: test_copysign_v1f64_v1f32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    fneg.2d v2, v2
 ; CHECK-NEXT:    bif.16b v0, v1, v2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
@@ -156,11 +156,11 @@ define <4 x double> @test_copysign_v4f64_v4f32(<4 x double> %a, <4 x float> %b)
 ; CHECK-LABEL: test_copysign_v4f64_v4f32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.2d v3, #0xffffffffffffffff
-; CHECK-NEXT:    fcvtl2 v4.2d, v2.4s
-; CHECK-NEXT:    fcvtl v2.2d, v2.2s
+; CHECK-NEXT:    fcvtl v4.2d, v2.2s
+; CHECK-NEXT:    fcvtl2 v2.2d, v2.4s
 ; CHECK-NEXT:    fneg.2d v3, v3
-; CHECK-NEXT:    bif.16b v1, v4, v3
-; CHECK-NEXT:    bif.16b v0, v2, v3
+; CHECK-NEXT:    bif.16b v1, v2, v3
+; CHECK-NEXT:    bif.16b v0, v4, v3
 ; CHECK-NEXT:    ret
   %tmp0 = fpext <4 x float> %b to <4 x double>
   %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0)
@@ -191,28 +191,29 @@ define <4 x half> @test_copysign_v4f16_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; NOFP16-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; NOFP16-NEXT:    mov h3, v1[1]
 ; NOFP16-NEXT:    mov h4, v0[1]
-; NOFP16-NEXT:    fcvt s5, h1
-; NOFP16-NEXT:    fcvt s6, h0
-; NOFP16-NEXT:    mov h7, v1[2]
-; NOFP16-NEXT:    mov h16, v0[2]
+; NOFP16-NEXT:    mov h5, v1[2]
+; NOFP16-NEXT:    mov h6, v0[2]
 ; NOFP16-NEXT:    mvni.4s v2, #128, lsl #24
+; NOFP16-NEXT:    fcvt s7, h1
+; NOFP16-NEXT:    fcvt s16, h0
 ; NOFP16-NEXT:    mov h1, v1[3]
 ; NOFP16-NEXT:    fcvt s3, h3
 ; NOFP16-NEXT:    fcvt s4, h4
-; NOFP16-NEXT:    bit.16b v5, v6, v2
-; NOFP16-NEXT:    fcvt s6, h7
-; NOFP16-NEXT:    fcvt s7, h16
 ; NOFP16-NEXT:    fcvt s1, h1
 ; NOFP16-NEXT:    bit.16b v3, v4, v2
-; NOFP16-NEXT:    mov h4, v0[3]
-; NOFP16-NEXT:    fcvt h0, s5
-; NOFP16-NEXT:    bit.16b v6, v7, v2
+; NOFP16-NEXT:    fcvt s4, h5
+; NOFP16-NEXT:    fcvt s5, h6
+; NOFP16-NEXT:    mov.16b v6, v2
+; NOFP16-NEXT:    bsl.16b v6, v16, v7
+; NOFP16-NEXT:    mov h7, v0[3]
+; NOFP16-NEXT:    bit.16b v4, v5, v2
 ; NOFP16-NEXT:    fcvt h3, s3
-; NOFP16-NEXT:    fcvt s4, h4
-; NOFP16-NEXT:    fcvt h5, s6
+; NOFP16-NEXT:    fcvt h0, s6
+; NOFP16-NEXT:    fcvt s5, h7
 ; NOFP16-NEXT:    mov.h v0[1], v3[0]
-; NOFP16-NEXT:    bit.16b v1, v4, v2
-; NOFP16-NEXT:    mov.h v0[2], v5[0]
+; NOFP16-NEXT:    fcvt h3, s4
+; NOFP16-NEXT:    bit.16b v1, v5, v2
+; NOFP16-NEXT:    mov.h v0[2], v3[0]
 ; NOFP16-NEXT:    fcvt h1, s1
 ; NOFP16-NEXT:    mov.h v0[3], v1[0]
 ; NOFP16-NEXT:    ; kill: def $d0 killed $d0 killed $q0
@@ -233,29 +234,30 @@ define <4 x half> @test_copysign_v4f16_v4f32(<4 x half> %a, <4 x float> %b) #0 {
 ; NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; NOFP16-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; NOFP16-NEXT:    mov h3, v0[1]
-; NOFP16-NEXT:    fcvt s5, h0
-; NOFP16-NEXT:    mov h7, v0[2]
+; NOFP16-NEXT:    mov h5, v0[2]
 ; NOFP16-NEXT:    mvni.4s v2, #128, lsl #24
+; NOFP16-NEXT:    fcvt s7, h0
 ; NOFP16-NEXT:    mov h4, v1[1]
-; NOFP16-NEXT:    fcvt s6, h1
-; NOFP16-NEXT:    mov h16, v1[2]
 ; NOFP16-NEXT:    fcvt s3, h3
+; NOFP16-NEXT:    mov h6, v1[2]
+; NOFP16-NEXT:    fcvt s16, h1
 ; NOFP16-NEXT:    mov h1, v1[3]
 ; NOFP16-NEXT:    fcvt s4, h4
-; NOFP16-NEXT:    bif.16b v5, v6, v2
-; NOFP16-NEXT:    fcvt s6, h7
-; NOFP16-NEXT:    fcvt s7, h16
 ; NOFP16-NEXT:    fcvt s1, h1
 ; NOFP16-NEXT:    bif.16b v3, v4, v2
-; NOFP16-NEXT:    mov h4, v0[3]
-; NOFP16-NEXT:    fcvt h0, s5
-; NOFP16-NEXT:    bif.16b v6, v7, v2
+; NOFP16-NEXT:    fcvt s4, h5
+; NOFP16-NEXT:    fcvt s5, h6
+; NOFP16-NEXT:    mov.16b v6, v2
+; NOFP16-NEXT:    bsl.16b v6, v7, v16
+; NOFP16-NEXT:    mov h7, v0[3]
+; NOFP16-NEXT:    bif.16b v4, v5, v2
 ; NOFP16-NEXT:    fcvt h3, s3
-; NOFP16-NEXT:    fcvt s4, h4
-; NOFP16-NEXT:    fcvt h5, s6
+; NOFP16-NEXT:    fcvt h0, s6
+; NOFP16-NEXT:    fcvt s5, h7
 ; NOFP16-NEXT:    mov.h v0[1], v3[0]
-; NOFP16-NEXT:    bit.16b v1, v4, v2
-; NOFP16-NEXT:    mov.h v0[2], v5[0]
+; NOFP16-NEXT:    fcvt h3, s4
+; NOFP16-NEXT:    bit.16b v1, v5, v2
+; NOFP16-NEXT:    mov.h v0[2], v3[0]
 ; NOFP16-NEXT:    fcvt h1, s1
 ; NOFP16-NEXT:    mov.h v0[3], v1[0]
 ; NOFP16-NEXT:    ; kill: def $d0 killed $d0 killed $q0
@@ -276,31 +278,31 @@ define <4 x half> @test_copysign_v4f16_v4f64(<4 x half> %a, <4 x double> %b) #0
 ; NOFP16-LABEL: test_copysign_v4f16_v4f64:
 ; NOFP16:       ; %bb.0:
 ; NOFP16-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; NOFP16-NEXT:    mov d4, v1[1]
-; NOFP16-NEXT:    mov h5, v0[1]
+; NOFP16-NEXT:    mov d3, v1[1]
+; NOFP16-NEXT:    mov h4, v0[1]
 ; NOFP16-NEXT:    fcvt s1, d1
-; NOFP16-NEXT:    fcvt s6, h0
+; NOFP16-NEXT:    fcvt s5, h0
 ; NOFP16-NEXT:    mov h7, v0[2]
-; NOFP16-NEXT:    mvni.4s v3, #128, lsl #24
-; NOFP16-NEXT:    fcvt s4, d4
-; NOFP16-NEXT:    fcvt s5, h5
-; NOFP16-NEXT:    bit.16b v1, v6, v3
-; NOFP16-NEXT:    fcvt s6, d2
+; NOFP16-NEXT:    mvni.4s v6, #128, lsl #24
+; NOFP16-NEXT:    fcvt s3, d3
+; NOFP16-NEXT:    fcvt s4, h4
+; NOFP16-NEXT:    bit.16b v1, v5, v6
 ; NOFP16-NEXT:    fcvt s7, h7
-; NOFP16-NEXT:    mov d2, v2[1]
-; NOFP16-NEXT:    bit.16b v4, v5, v3
 ; NOFP16-NEXT:    mov h5, v0[3]
-; NOFP16-NEXT:    fcvt h0, s1
-; NOFP16-NEXT:    bit.16b v6, v7, v3
+; NOFP16-NEXT:    bit.16b v3, v4, v6
+; NOFP16-NEXT:    mov d4, v2[1]
 ; NOFP16-NEXT:    fcvt s2, d2
-; NOFP16-NEXT:    fcvt h1, s4
+; NOFP16-NEXT:    fcvt h0, s1
+; NOFP16-NEXT:    fcvt h1, s3
+; NOFP16-NEXT:    bit.16b v2, v7, v6
+; NOFP16-NEXT:    fcvt s3, d4
 ; NOFP16-NEXT:    fcvt s4, h5
-; NOFP16-NEXT:    fcvt h5, s6
 ; NOFP16-NEXT:    mov.h v0[1], v1[0]
-; NOFP16-NEXT:    mov.16b v1, v3
-; NOFP16-NEXT:    bsl.16b v1, v4, v2
-; NOFP16-NEXT:    mov.h v0[2], v5[0]
-; NOFP16-NEXT:    fcvt h1, s1
+; NOFP16-NEXT:    fcvt h1, s2
+; NOFP16-NEXT:    mov.16b v2, v6
+; NOFP16-NEXT:    bsl.16b v2, v4, v3
+; NOFP16-NEXT:    mov.h v0[2], v1[0]
+; NOFP16-NEXT:    fcvt h1, s2
 ; NOFP16-NEXT:    mov.h v0[3], v1[0]
 ; NOFP16-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; NOFP16-NEXT:    ret
@@ -331,63 +333,61 @@ declare <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %b) #0
 define <8 x half> @test_copysign_v8f16_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; NOFP16-LABEL: test_copysign_v8f16_v8f16:
 ; NOFP16:       ; %bb.0:
-; NOFP16-NEXT:    mov h5, v1[1]
-; NOFP16-NEXT:    mov h6, v0[1]
-; NOFP16-NEXT:    fcvt s2, h1
-; NOFP16-NEXT:    fcvt s4, h0
+; NOFP16-NEXT:    mov h2, v1[1]
+; NOFP16-NEXT:    mov h4, v0[1]
+; NOFP16-NEXT:    fcvt s5, h1
+; NOFP16-NEXT:    fcvt s6, h0
+; NOFP16-NEXT:    mvni.4s v3, #128, lsl #24
 ; NOFP16-NEXT:    mov h7, v1[2]
 ; NOFP16-NEXT:    mov h16, v0[2]
-; NOFP16-NEXT:    mvni.4s v3, #128, lsl #24
-; NOFP16-NEXT:    mov h17, v0[3]
-; NOFP16-NEXT:    fcvt s5, h5
-; NOFP16-NEXT:    fcvt s6, h6
-; NOFP16-NEXT:    mov h18, v1[5]
-; NOFP16-NEXT:    bit.16b v2, v4, v3
-; NOFP16-NEXT:    mov h4, v1[3]
+; NOFP16-NEXT:    mov h17, v1[3]
+; NOFP16-NEXT:    fcvt s2, h2
+; NOFP16-NEXT:    fcvt s4, h4
+; NOFP16-NEXT:    bit.16b v5, v6, v3
+; NOFP16-NEXT:    mov h6, v0[3]
 ; NOFP16-NEXT:    fcvt s7, h7
 ; NOFP16-NEXT:    fcvt s16, h16
 ; NOFP16-NEXT:    fcvt s17, h17
-; NOFP16-NEXT:    bit.16b v5, v6, v3
-; NOFP16-NEXT:    fcvt s18, h18
-; NOFP16-NEXT:    mov.16b v6, v3
-; NOFP16-NEXT:    fcvt s4, h4
-; NOFP16-NEXT:    bsl.16b v6, v16, v7
+; NOFP16-NEXT:    bif.16b v4, v2, v3
+; NOFP16-NEXT:    fcvt h2, s5
+; NOFP16-NEXT:    mov.16b v5, v3
+; NOFP16-NEXT:    fcvt s6, h6
+; NOFP16-NEXT:    bsl.16b v5, v16, v7
+; NOFP16-NEXT:    fcvt h4, s4
 ; NOFP16-NEXT:    mov h7, v1[4]
 ; NOFP16-NEXT:    mov h16, v0[4]
-; NOFP16-NEXT:    fcvt h2, s2
-; NOFP16-NEXT:    fcvt h5, s5
-; NOFP16-NEXT:    bit.16b v4, v17, v3
+; NOFP16-NEXT:    bif.16b v6, v17, v3
 ; NOFP16-NEXT:    mov h17, v0[5]
-; NOFP16-NEXT:    fcvt s7, h7
-; NOFP16-NEXT:    fcvt s16, h16
+; NOFP16-NEXT:    fcvt h5, s5
+; NOFP16-NEXT:    mov.h v2[1], v4[0]
+; NOFP16-NEXT:    fcvt s4, h7
+; NOFP16-NEXT:    fcvt s7, h16
+; NOFP16-NEXT:    mov h16, v1[5]
 ; NOFP16-NEXT:    fcvt h6, s6
-; NOFP16-NEXT:    mov.h v2[1], v5[0]
-; NOFP16-NEXT:    mov.16b v5, v3
 ; NOFP16-NEXT:    fcvt s17, h17
-; NOFP16-NEXT:    bsl.16b v5, v16, v7
-; NOFP16-NEXT:    mov h7, v1[6]
-; NOFP16-NEXT:    mov h16, v0[6]
-; NOFP16-NEXT:    mov.h v2[2], v6[0]
-; NOFP16-NEXT:    fcvt h4, s4
-; NOFP16-NEXT:    mov.16b v6, v3
-; NOFP16-NEXT:    bsl.16b v6, v17, v18
-; NOFP16-NEXT:    fcvt s7, h7
-; NOFP16-NEXT:    fcvt s16, h16
+; NOFP16-NEXT:    mov.h v2[2], v5[0]
+; NOFP16-NEXT:    mov h5, v1[6]
 ; NOFP16-NEXT:    mov h1, v1[7]
-; NOFP16-NEXT:    mov.h v2[3], v4[0]
-; NOFP16-NEXT:    fcvt h4, s5
+; NOFP16-NEXT:    bit.16b v4, v7, v3
+; NOFP16-NEXT:    mov h7, v0[6]
+; NOFP16-NEXT:    fcvt s16, h16
 ; NOFP16-NEXT:    mov h0, v0[7]
-; NOFP16-NEXT:    mov.16b v5, v3
-; NOFP16-NEXT:    bsl.16b v5, v16, v7
-; NOFP16-NEXT:    mov.h v2[4], v4[0]
-; NOFP16-NEXT:    fcvt h4, s6
+; NOFP16-NEXT:    mov.h v2[3], v6[0]
+; NOFP16-NEXT:    fcvt s5, h5
 ; NOFP16-NEXT:    fcvt s1, h1
+; NOFP16-NEXT:    fcvt s6, h7
+; NOFP16-NEXT:    mov.16b v7, v3
+; NOFP16-NEXT:    fcvt h4, s4
 ; NOFP16-NEXT:    fcvt s0, h0
-; NOFP16-NEXT:    fcvt h5, s5
-; NOFP16-NEXT:    mov.h v2[5], v4[0]
+; NOFP16-NEXT:    bsl.16b v7, v17, v16
+; NOFP16-NEXT:    bit.16b v5, v6, v3
+; NOFP16-NEXT:    mov.h v2[4], v4[0]
 ; NOFP16-NEXT:    bif.16b v0, v1, v3
-; NOFP16-NEXT:    mov.h v2[6], v5[0]
+; NOFP16-NEXT:    fcvt h4, s7
 ; NOFP16-NEXT:    fcvt h0, s0
+; NOFP16-NEXT:    mov.h v2[5], v4[0]
+; NOFP16-NEXT:    fcvt h4, s5
+; NOFP16-NEXT:    mov.h v2[6], v4[0]
 ; NOFP16-NEXT:    mov.h v2[7], v0[0]
 ; NOFP16-NEXT:    mov.16b v0, v2
 ; NOFP16-NEXT:    ret
@@ -407,59 +407,58 @@ define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 {
 ; NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; NOFP16-NEXT:    mov h4, v0[1]
 ; NOFP16-NEXT:    fcvt s6, h0
-; NOFP16-NEXT:    mov h16, v0[2]
 ; NOFP16-NEXT:    mvni.4s v3, #128, lsl #24
+; NOFP16-NEXT:    mov h7, v0[2]
 ; NOFP16-NEXT:    fcvtn v2.4h, v2.4s
 ; NOFP16-NEXT:    mov h5, v1[1]
-; NOFP16-NEXT:    fcvt s7, h1
+; NOFP16-NEXT:    fcvt s16, h1
 ; NOFP16-NEXT:    fcvt s4, h4
 ; NOFP16-NEXT:    mov h17, v1[2]
 ; NOFP16-NEXT:    mov h1, v1[3]
-; NOFP16-NEXT:    fcvt s16, h16
+; NOFP16-NEXT:    fcvt s7, h7
 ; NOFP16-NEXT:    fcvt s5, h5
-; NOFP16-NEXT:    bif.16b v6, v7, v3
-; NOFP16-NEXT:    mov h7, v0[3]
+; NOFP16-NEXT:    bif.16b v6, v16, v3
+; NOFP16-NEXT:    mov h16, v0[3]
 ; NOFP16-NEXT:    fcvt s17, h17
 ; NOFP16-NEXT:    fcvt s18, h1
 ; NOFP16-NEXT:    bif.16b v4, v5, v3
-; NOFP16-NEXT:    mov h5, v0[4]
 ; NOFP16-NEXT:    fcvt h1, s6
 ; NOFP16-NEXT:    mov.16b v6, v3
-; NOFP16-NEXT:    fcvt s7, h7
-; NOFP16-NEXT:    bsl.16b v6, v16, v17
+; NOFP16-NEXT:    mov h5, v0[4]
+; NOFP16-NEXT:    fcvt s16, h16
+; NOFP16-NEXT:    bsl.16b v6, v7, v17
+; NOFP16-NEXT:    mov h7, v0[5]
+; NOFP16-NEXT:    mov h17, v2[1]
 ; NOFP16-NEXT:    fcvt h4, s4
-; NOFP16-NEXT:    mov h16, v0[5]
 ; NOFP16-NEXT:    fcvt s5, h5
-; NOFP16-NEXT:    fcvt s17, h2
-; NOFP16-NEXT:    bif.16b v7, v18, v3
-; NOFP16-NEXT:    mov h18, v2[1]
-; NOFP16-NEXT:    mov.h v1[1], v4[0]
+; NOFP16-NEXT:    bif.16b v16, v18, v3
 ; NOFP16-NEXT:    fcvt h6, s6
-; NOFP16-NEXT:    fcvt s4, h16
-; NOFP16-NEXT:    bif.16b v5, v17, v3
-; NOFP16-NEXT:    fcvt s16, h18
-; NOFP16-NEXT:    mov h17, v0[6]
-; NOFP16-NEXT:    mov h18, v2[2]
-; NOFP16-NEXT:    mov h0, v0[7]
+; NOFP16-NEXT:    fcvt s7, h7
+; NOFP16-NEXT:    fcvt s17, h17
+; NOFP16-NEXT:    mov.h v1[1], v4[0]
+; NOFP16-NEXT:    fcvt s4, h2
+; NOFP16-NEXT:    bif.16b v7, v17, v3
+; NOFP16-NEXT:    bit.16b v4, v5, v3
+; NOFP16-NEXT:    fcvt h5, s16
 ; NOFP16-NEXT:    mov.h v1[2], v6[0]
-; NOFP16-NEXT:    fcvt h6, s7
-; NOFP16-NEXT:    bif.16b v4, v16, v3
-; NOFP16-NEXT:    fcvt s7, h17
-; NOFP16-NEXT:    fcvt s16, h18
-; NOFP16-NEXT:    fcvt h5, s5
-; NOFP16-NEXT:    mov.h v1[3], v6[0]
+; NOFP16-NEXT:    mov h6, v0[6]
+; NOFP16-NEXT:    mov h16, v2[2]
+; NOFP16-NEXT:    mov h0, v0[7]
 ; NOFP16-NEXT:    mov h2, v2[3]
-; NOFP16-NEXT:    mov.16b v6, v3
+; NOFP16-NEXT:    mov.h v1[3], v5[0]
 ; NOFP16-NEXT:    fcvt h4, s4
-; NOFP16-NEXT:    bsl.16b v6, v7, v16
+; NOFP16-NEXT:    fcvt s5, h6
+; NOFP16-NEXT:    fcvt s6, h16
 ; NOFP16-NEXT:    fcvt s0, h0
-; NOFP16-NEXT:    mov.h v1[4], v5[0]
 ; NOFP16-NEXT:    fcvt s2, h2
-; NOFP16-NEXT:    fcvt h5, s6
-; NOFP16-NEXT:    mov.h v1[5], v4[0]
+; NOFP16-NEXT:    mov.h v1[4], v4[0]
+; NOFP16-NEXT:    fcvt h4, s7
+; NOFP16-NEXT:    bif.16b v5, v6, v3
 ; NOFP16-NEXT:    bif.16b v0, v2, v3
-; NOFP16-NEXT:    mov.h v1[6], v5[0]
+; NOFP16-NEXT:    mov.h v1[5], v4[0]
+; NOFP16-NEXT:    fcvt h4, s5
 ; NOFP16-NEXT:    fcvt h0, s0
+; NOFP16-NEXT:    mov.h v1[6], v4[0]
 ; NOFP16-NEXT:    mov.h v1[7], v0[0]
 ; NOFP16-NEXT:    mov.16b v0, v1
 ; NOFP16-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
index 111c9a6f07987a..8f38bdbedc629b 100644
--- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll
@@ -36,24 +36,24 @@ define <8 x float> @sitofp_v8i8_float(<8 x i8> %a) {
 define <16 x float> @sitofp_v16i8_float(<16 x i8> %a) {
 ; CHECK-LABEL: sitofp_v16i8_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 v1.8b, v0.8b, v0.8b
-; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    zip1 v2.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    shl v1.4h, v1.4h, #8
-; CHECK-NEXT:    zip1 v3.8b, v2.8b, v0.8b
+; CHECK-NEXT:    zip1 v3.8b, v1.8b, v0.8b
+; CHECK-NEXT:    zip2 v1.8b, v1.8b, v0.8b
+; CHECK-NEXT:    shl v2.4h, v2.4h, #8
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
-; CHECK-NEXT:    sshr v1.4h, v1.4h, #8
-; CHECK-NEXT:    zip2 v2.8b, v2.8b, v0.8b
-; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
-; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-NEXT:    sshr v2.4h, v2.4h, #8
 ; CHECK-NEXT:    shl v3.4h, v3.4h, #8
+; CHECK-NEXT:    shl v1.4h, v1.4h, #8
+; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    sshr v3.4h, v3.4h, #8
+; CHECK-NEXT:    sshr v1.4h, v1.4h, #8
 ; CHECK-NEXT:    sshll v4.4s, v0.4h, #0
-; CHECK-NEXT:    shl v2.4h, v2.4h, #8
-; CHECK-NEXT:    sshr v0.4h, v3.4h, #8
-; CHECK-NEXT:    sshr v2.4h, v2.4h, #8
-; CHECK-NEXT:    sshll v3.4s, v0.4h, #0
-; CHECK-NEXT:    scvtf v0.4s, v1.4s
-; CHECK-NEXT:    sshll v5.4s, v2.4h, #0
+; CHECK-NEXT:    scvtf v0.4s, v2.4s
+; CHECK-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-NEXT:    sshll v5.4s, v1.4h, #0
 ; CHECK-NEXT:    scvtf v1.4s, v4.4s
 ; CHECK-NEXT:    scvtf v2.4s, v3.4s
 ; CHECK-NEXT:    scvtf v3.4s, v5.4s
@@ -87,8 +87,8 @@ define <8 x float> @sitofp_i32_float(<8 x i32> %a) {
 define <8 x float> @sitofp_i64_float(<8 x i64> %a) {
 ; CHECK-LABEL: sitofp_i64_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    scvtf v2.2d, v2.2d
 ; CHECK-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-NEXT:    scvtf v2.2d, v2.2d
 ; CHECK-NEXT:    scvtf v4.2d, v1.2d
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-NEXT:    fcvtn v1.2s, v2.2d
@@ -130,23 +130,23 @@ define <8 x float> @uitofp_v8i8_float(<8 x i8> %a) {
 define <16 x float> @uitofp_v16i8_float(<16 x i8> %a) {
 ; CHECK-LABEL: uitofp_v16i8_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 v1.8b, v0.8b, v0.8b
-; CHECK-NEXT:    zip2 v2.8b, v0.8b, v0.8b
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    bic v2.4h, #255, lsl #8
-; CHECK-NEXT:    zip1 v3.8b, v0.8b, v0.8b
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    zip1 v2.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    zip1 v3.8b, v1.8b, v0.8b
+; CHECK-NEXT:    zip2 v1.8b, v1.8b, v0.8b
+; CHECK-NEXT:    bic v2.4h, #255, lsl #8
+; CHECK-NEXT:    bic v0.4h, #255, lsl #8
 ; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-NEXT:    bic v3.4h, #255, lsl #8
-; CHECK-NEXT:    bic v0.4h, #255, lsl #8
-; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-NEXT:    bic v1.4h, #255, lsl #8
 ; CHECK-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-NEXT:    ucvtf v0.4s, v1.4s
-; CHECK-NEXT:    ucvtf v1.4s, v2.4s
+; CHECK-NEXT:    ucvtf v0.4s, v2.4s
+; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-NEXT:    ucvtf v1.4s, v4.4s
 ; CHECK-NEXT:    ucvtf v2.4s, v3.4s
-; CHECK-NEXT:    ucvtf v3.4s, v4.4s
+; CHECK-NEXT:    ucvtf v3.4s, v5.4s
 ; CHECK-NEXT:    ret
   %1 = uitofp <16 x i8> %a to <16 x float>
   ret <16 x float> %1
@@ -177,8 +177,8 @@ define <8 x float> @uitofp_i32_float(<8 x i32> %a) {
 define <8 x float> @uitofp_i64_float(<8 x i64> %a) {
 ; CHECK-LABEL: uitofp_i64_float:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ucvtf v2.2d, v2.2d
 ; CHECK-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-NEXT:    ucvtf v2.2d, v2.2d
 ; CHECK-NEXT:    ucvtf v4.2d, v1.2d
 ; CHECK-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-NEXT:    fcvtn v1.2s, v2.2d
@@ -215,25 +215,25 @@ define <8 x double> @sitofp_v8i8_double(<8 x i8> %a) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    umov w8, v0.b[0]
 ; CHECK-NEXT:    umov w9, v0.b[2]
-; CHECK-NEXT:    umov w10, v0.b[4]
-; CHECK-NEXT:    umov w11, v0.b[6]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    umov w8, v0.b[1]
-; CHECK-NEXT:    fmov s2, w9
-; CHECK-NEXT:    umov w9, v0.b[3]
-; CHECK-NEXT:    fmov s3, w10
-; CHECK-NEXT:    umov w10, v0.b[5]
-; CHECK-NEXT:    fmov s4, w11
-; CHECK-NEXT:    umov w11, v0.b[7]
-; CHECK-NEXT:    mov v1.s[1], w8
-; CHECK-NEXT:    mov v2.s[1], w9
-; CHECK-NEXT:    mov v3.s[1], w10
-; CHECK-NEXT:    mov v4.s[1], w11
-; CHECK-NEXT:    shl v0.2s, v1.2s, #24
-; CHECK-NEXT:    shl v1.2s, v2.2s, #24
-; CHECK-NEXT:    shl v2.2s, v3.2s, #24
+; CHECK-NEXT:    umov w11, v0.b[4]
+; CHECK-NEXT:    umov w12, v0.b[6]
+; CHECK-NEXT:    umov w10, v0.b[1]
+; CHECK-NEXT:    umov w13, v0.b[3]
+; CHECK-NEXT:    umov w14, v0.b[5]
+; CHECK-NEXT:    umov w15, v0.b[7]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fmov s2, w11
+; CHECK-NEXT:    fmov s3, w12
+; CHECK-NEXT:    mov v0.s[1], w10
+; CHECK-NEXT:    mov v1.s[1], w13
+; CHECK-NEXT:    mov v2.s[1], w14
+; CHECK-NEXT:    mov v3.s[1], w15
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    shl v2.2s, v2.2s, #24
+; CHECK-NEXT:    shl v3.2s, v3.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
-; CHECK-NEXT:    shl v3.2s, v4.2s, #24
 ; CHECK-NEXT:    sshr v1.2s, v1.2s, #24
 ; CHECK-NEXT:    sshr v2.2s, v2.2s, #24
 ; CHECK-NEXT:    sshr v3.2s, v3.2s, #24
@@ -253,71 +253,71 @@ define <8 x double> @sitofp_v8i8_double(<8 x i8> %a) {
 define <16 x double> @sitofp_v16i8_double(<16 x i8> %a) {
 ; CHECK-LABEL: sitofp_v16i8_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    umov w8, v0.b[0]
-; CHECK-NEXT:    umov w9, v0.b[2]
-; CHECK-NEXT:    umov w11, v0.b[1]
+; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    umov w9, v0.b[1]
+; CHECK-NEXT:    umov w10, v0.b[2]
 ; CHECK-NEXT:    umov w12, v0.b[4]
-; CHECK-NEXT:    umov w10, v1.b[0]
+; CHECK-NEXT:    umov w14, v0.b[6]
+; CHECK-NEXT:    umov w11, v0.b[3]
+; CHECK-NEXT:    umov w13, v0.b[5]
 ; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    umov w8, v1.b[2]
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    umov w9, v1.b[1]
-; CHECK-NEXT:    fmov s7, w12
-; CHECK-NEXT:    mov v2.s[1], w11
-; CHECK-NEXT:    umov w11, v1.b[3]
-; CHECK-NEXT:    fmov s4, w10
-; CHECK-NEXT:    umov w10, v1.b[4]
-; CHECK-NEXT:    fmov s5, w8
-; CHECK-NEXT:    umov w8, v1.b[6]
-; CHECK-NEXT:    umov w12, v0.b[7]
-; CHECK-NEXT:    mov v4.s[1], w9
-; CHECK-NEXT:    umov w9, v1.b[5]
-; CHECK-NEXT:    mov v5.s[1], w11
-; CHECK-NEXT:    fmov s6, w10
-; CHECK-NEXT:    umov w10, v0.b[6]
-; CHECK-NEXT:    umov w11, v1.b[7]
-; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    umov w8, v0.b[3]
-; CHECK-NEXT:    mov v6.s[1], w9
-; CHECK-NEXT:    umov w9, v0.b[5]
-; CHECK-NEXT:    shl v4.2s, v4.2s, #24
+; CHECK-NEXT:    umov w15, v1.b[0]
+; CHECK-NEXT:    umov w17, v1.b[2]
+; CHECK-NEXT:    umov w0, v1.b[4]
+; CHECK-NEXT:    umov w16, v1.b[1]
+; CHECK-NEXT:    umov w18, v1.b[3]
+; CHECK-NEXT:    umov w8, v0.b[7]
 ; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    shl v5.2s, v5.2s, #24
-; CHECK-NEXT:    mov v1.s[1], w11
+; CHECK-NEXT:    umov w10, v1.b[5]
+; CHECK-NEXT:    mov v2.s[1], w9
+; CHECK-NEXT:    umov w9, v1.b[6]
+; CHECK-NEXT:    fmov s3, w12
+; CHECK-NEXT:    umov w12, v1.b[7]
+; CHECK-NEXT:    fmov s1, w14
+; CHECK-NEXT:    fmov s4, w15
+; CHECK-NEXT:    fmov s5, w17
+; CHECK-NEXT:    fmov s6, w0
+; CHECK-NEXT:    mov v0.s[1], w11
+; CHECK-NEXT:    mov v3.s[1], w13
+; CHECK-NEXT:    fmov s7, w9
+; CHECK-NEXT:    mov v1.s[1], w8
+; CHECK-NEXT:    mov v4.s[1], w16
+; CHECK-NEXT:    mov v5.s[1], w18
+; CHECK-NEXT:    mov v6.s[1], w10
 ; CHECK-NEXT:    shl v2.2s, v2.2s, #24
-; CHECK-NEXT:    mov v3.s[1], w8
-; CHECK-NEXT:    mov v7.s[1], w9
-; CHECK-NEXT:    mov v0.s[1], w12
-; CHECK-NEXT:    shl v6.2s, v6.2s, #24
-; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    mov v7.s[1], w12
 ; CHECK-NEXT:    shl v3.2s, v3.2s, #24
+; CHECK-NEXT:    shl v1.2s, v1.2s, #24
+; CHECK-NEXT:    shl v4.2s, v4.2s, #24
+; CHECK-NEXT:    sshr v2.2s, v2.2s, #24
+; CHECK-NEXT:    shl v5.2s, v5.2s, #24
+; CHECK-NEXT:    shl v6.2s, v6.2s, #24
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-NEXT:    sshr v3.2s, v3.2s, #24
 ; CHECK-NEXT:    shl v7.2s, v7.2s, #24
 ; CHECK-NEXT:    sshr v4.2s, v4.2s, #24
-; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    sshr v1.2s, v1.2s, #24
 ; CHECK-NEXT:    sshr v5.2s, v5.2s, #24
 ; CHECK-NEXT:    sshr v6.2s, v6.2s, #24
-; CHECK-NEXT:    sshr v1.2s, v1.2s, #24
-; CHECK-NEXT:    sshr v2.2s, v2.2s, #24
-; CHECK-NEXT:    sshr v3.2s, v3.2s, #24
+; CHECK-NEXT:    sshll v2.2d, v2.2s, #0
+; CHECK-NEXT:    sshll v16.2d, v0.2s, #0
+; CHECK-NEXT:    sshll v3.2d, v3.2s, #0
 ; CHECK-NEXT:    sshr v7.2s, v7.2s, #24
-; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sshll v4.2d, v4.2s, #0
+; CHECK-NEXT:    sshll v17.2d, v1.2s, #0
 ; CHECK-NEXT:    sshll v5.2d, v5.2s, #0
 ; CHECK-NEXT:    sshll v6.2d, v6.2s, #0
-; CHECK-NEXT:    sshll v16.2d, v1.2s, #0
-; CHECK-NEXT:    sshll v1.2d, v2.2s, #0
-; CHECK-NEXT:    sshll v2.2d, v3.2s, #0
-; CHECK-NEXT:    sshll v3.2d, v7.2s, #0
-; CHECK-NEXT:    sshll v7.2d, v0.2s, #0
-; CHECK-NEXT:    scvtf v0.2d, v1.2d
-; CHECK-NEXT:    scvtf v1.2d, v2.2d
+; CHECK-NEXT:    scvtf v0.2d, v2.2d
+; CHECK-NEXT:    scvtf v1.2d, v16.2d
 ; CHECK-NEXT:    scvtf v2.2d, v3.2d
-; CHECK-NEXT:    scvtf v3.2d, v7.2d
+; CHECK-NEXT:    sshll v7.2d, v7.2s, #0
 ; CHECK-NEXT:    scvtf v4.2d, v4.2d
+; CHECK-NEXT:    scvtf v3.2d, v17.2d
 ; CHECK-NEXT:    scvtf v5.2d, v5.2d
 ; CHECK-NEXT:    scvtf v6.2d, v6.2d
-; CHECK-NEXT:    scvtf v7.2d, v16.2d
+; CHECK-NEXT:    scvtf v7.2d, v7.2d
 ; CHECK-NEXT:    ret
   %1 = sitofp <16 x i8> %a to <16 x double>
   ret <16 x double> %1
@@ -326,15 +326,15 @@ define <16 x double> @sitofp_v16i8_double(<16 x i8> %a) {
 define <8 x double> @sitofp_i16_double(<8 x i16> %a) {
 ; CHECK-LABEL: sitofp_i16_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    sshll2 v2.2d, v1.4s, #0
+; CHECK-NEXT:    sshll v1.4s, v0.4h, #0
+; CHECK-NEXT:    sshll2 v0.4s, v0.8h, #0
+; CHECK-NEXT:    sshll v2.2d, v1.2s, #0
 ; CHECK-NEXT:    sshll2 v3.2d, v0.4s, #0
-; CHECK-NEXT:    sshll v4.2d, v1.2s, #0
-; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-NEXT:    scvtf v1.2d, v3.2d
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v3.2d, v2.2d
+; CHECK-NEXT:    sshll2 v1.2d, v1.4s, #0
+; CHECK-NEXT:    sshll v4.2d, v0.2s, #0
+; CHECK-NEXT:    scvtf v0.2d, v2.2d
+; CHECK-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-NEXT:    scvtf v1.2d, v1.2d
 ; CHECK-NEXT:    scvtf v2.2d, v4.2d
 ; CHECK-NEXT:    ret
   %1 = sitofp <8 x i16> %a to <8 x double>
@@ -344,14 +344,14 @@ define <8 x double> @sitofp_i16_double(<8 x i16> %a) {
 define <8 x double> @sitofp_i32_double(<8 x i32> %a) {
 ; CHECK-LABEL: sitofp_i32_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sshll2 v2.2d, v0.4s, #0
-; CHECK-NEXT:    sshll2 v3.2d, v1.4s, #0
-; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-NEXT:    sshll v4.2d, v1.2s, #0
-; CHECK-NEXT:    scvtf v1.2d, v2.2d
-; CHECK-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-NEXT:    scvtf v2.2d, v4.2d
+; CHECK-NEXT:    sshll v2.2d, v0.2s, #0
+; CHECK-NEXT:    sshll2 v3.2d, v0.4s, #0
+; CHECK-NEXT:    sshll2 v4.2d, v1.4s, #0
+; CHECK-NEXT:    sshll v5.2d, v1.2s, #0
+; CHECK-NEXT:    scvtf v0.2d, v2.2d
+; CHECK-NEXT:    scvtf v1.2d, v3.2d
+; CHECK-NEXT:    scvtf v3.2d, v4.2d
+; CHECK-NEXT:    scvtf v2.2d, v5.2d
 ; CHECK-NEXT:    ret
   %1 = sitofp <8 x i32> %a to <8 x double>
   ret <8 x double> %1
@@ -376,8 +376,8 @@ define <4 x double> @uitofp_v4i8_double(<4 x i8> %a) {
 ; CHECK-NEXT:    movi d1, #0x0000ff000000ff
 ; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    and v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-NEXT:    ucvtf v0.2d, v0.2d
 ; CHECK-NEXT:    ucvtf v1.2d, v1.2d
@@ -392,25 +392,25 @@ define <8 x double> @uitofp_v8i8_double(<8 x i8> %a) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    umov w8, v0.b[0]
 ; CHECK-NEXT:    umov w9, v0.b[2]
-; CHECK-NEXT:    umov w10, v0.b[4]
-; CHECK-NEXT:    umov w11, v0.b[6]
+; CHECK-NEXT:    umov w11, v0.b[4]
+; CHECK-NEXT:    umov w12, v0.b[6]
+; CHECK-NEXT:    umov w10, v0.b[1]
+; CHECK-NEXT:    umov w13, v0.b[3]
+; CHECK-NEXT:    umov w14, v0.b[5]
+; CHECK-NEXT:    umov w15, v0.b[7]
 ; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    fmov s2, w8
-; CHECK-NEXT:    umov w8, v0.b[1]
-; CHECK-NEXT:    fmov s3, w9
-; CHECK-NEXT:    umov w9, v0.b[3]
-; CHECK-NEXT:    fmov s4, w10
-; CHECK-NEXT:    umov w10, v0.b[5]
-; CHECK-NEXT:    fmov s5, w11
-; CHECK-NEXT:    umov w11, v0.b[7]
-; CHECK-NEXT:    mov v2.s[1], w8
-; CHECK-NEXT:    mov v3.s[1], w9
-; CHECK-NEXT:    mov v4.s[1], w10
-; CHECK-NEXT:    mov v5.s[1], w11
-; CHECK-NEXT:    and v0.8b, v2.8b, v1.8b
-; CHECK-NEXT:    and v2.8b, v3.8b, v1.8b
-; CHECK-NEXT:    and v3.8b, v4.8b, v1.8b
-; CHECK-NEXT:    and v1.8b, v5.8b, v1.8b
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    fmov s2, w9
+; CHECK-NEXT:    fmov s3, w11
+; CHECK-NEXT:    fmov s4, w12
+; CHECK-NEXT:    mov v0.s[1], w10
+; CHECK-NEXT:    mov v2.s[1], w13
+; CHECK-NEXT:    mov v3.s[1], w14
+; CHECK-NEXT:    mov v4.s[1], w15
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v2.8b, v2.8b, v1.8b
+; CHECK-NEXT:    and v3.8b, v3.8b, v1.8b
+; CHECK-NEXT:    and v1.8b, v4.8b, v1.8b
 ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-NEXT:    ushll v3.2d, v3.2s, #0
@@ -428,60 +428,60 @@ define <16 x double> @uitofp_v16i8_double(<16 x i8> %a) {
 ; CHECK-LABEL: uitofp_v16i8_double:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    umov w9, v0.b[0]
-; CHECK-NEXT:    umov w11, v0.b[1]
+; CHECK-NEXT:    umov w8, v0.b[0]
+; CHECK-NEXT:    umov w10, v0.b[2]
+; CHECK-NEXT:    umov w9, v0.b[1]
+; CHECK-NEXT:    umov w12, v0.b[4]
+; CHECK-NEXT:    umov w11, v0.b[3]
+; CHECK-NEXT:    umov w13, v0.b[5]
+; CHECK-NEXT:    umov w18, v0.b[6]
 ; CHECK-NEXT:    movi d1, #0x0000ff000000ff
-; CHECK-NEXT:    umov w8, v2.b[0]
-; CHECK-NEXT:    umov w10, v2.b[2]
-; CHECK-NEXT:    umov w12, v2.b[1]
-; CHECK-NEXT:    fmov s4, w9
-; CHECK-NEXT:    umov w9, v2.b[3]
+; CHECK-NEXT:    umov w14, v2.b[0]
+; CHECK-NEXT:    umov w16, v2.b[2]
+; CHECK-NEXT:    umov w0, v2.b[4]
 ; CHECK-NEXT:    fmov s3, w8
-; CHECK-NEXT:    umov w8, v2.b[4]
-; CHECK-NEXT:    fmov s5, w10
+; CHECK-NEXT:    umov w8, v0.b[7]
+; CHECK-NEXT:    fmov s0, w10
 ; CHECK-NEXT:    umov w10, v2.b[6]
-; CHECK-NEXT:    mov v4.s[1], w11
-; CHECK-NEXT:    mov v3.s[1], w12
+; CHECK-NEXT:    umov w15, v2.b[1]
+; CHECK-NEXT:    umov w17, v2.b[3]
+; CHECK-NEXT:    fmov s4, w12
 ; CHECK-NEXT:    umov w12, v2.b[5]
-; CHECK-NEXT:    fmov s6, w8
-; CHECK-NEXT:    umov w8, v0.b[2]
-; CHECK-NEXT:    mov v5.s[1], w9
+; CHECK-NEXT:    fmov s7, w18
+; CHECK-NEXT:    mov v3.s[1], w9
 ; CHECK-NEXT:    umov w9, v2.b[7]
-; CHECK-NEXT:    fmov s2, w10
-; CHECK-NEXT:    umov w10, v0.b[4]
-; CHECK-NEXT:    and v3.8b, v3.8b, v1.8b
-; CHECK-NEXT:    mov v6.s[1], w12
-; CHECK-NEXT:    umov w12, v0.b[6]
-; CHECK-NEXT:    fmov s7, w8
-; CHECK-NEXT:    umov w8, v0.b[3]
-; CHECK-NEXT:    and v5.8b, v5.8b, v1.8b
-; CHECK-NEXT:    mov v2.s[1], w9
-; CHECK-NEXT:    umov w9, v0.b[5]
+; CHECK-NEXT:    fmov s2, w14
+; CHECK-NEXT:    fmov s5, w16
+; CHECK-NEXT:    fmov s6, w0
+; CHECK-NEXT:    mov v0.s[1], w11
 ; CHECK-NEXT:    fmov s16, w10
-; CHECK-NEXT:    umov w10, v0.b[7]
-; CHECK-NEXT:    fmov s0, w12
+; CHECK-NEXT:    mov v4.s[1], w13
 ; CHECK-NEXT:    mov v7.s[1], w8
-; CHECK-NEXT:    and v6.8b, v6.8b, v1.8b
+; CHECK-NEXT:    mov v2.s[1], w15
+; CHECK-NEXT:    mov v5.s[1], w17
+; CHECK-NEXT:    mov v6.s[1], w12
+; CHECK-NEXT:    and v3.8b, v3.8b, v1.8b
 ; CHECK-NEXT:    mov v16.s[1], w9
-; CHECK-NEXT:    and v2.8b, v2.8b, v1.8b
-; CHECK-NEXT:    mov v0.s[1], w10
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    and v4.8b, v4.8b, v1.8b
 ; CHECK-NEXT:    and v7.8b, v7.8b, v1.8b
-; CHECK-NEXT:    and v16.8b, v16.8b, v1.8b
-; CHECK-NEXT:    ushll v17.2d, v3.2s, #0
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    and v2.8b, v2.8b, v1.8b
+; CHECK-NEXT:    ushll v3.2d, v3.2s, #0
+; CHECK-NEXT:    and v5.8b, v5.8b, v1.8b
+; CHECK-NEXT:    and v6.8b, v6.8b, v1.8b
+; CHECK-NEXT:    and v1.8b, v16.8b, v1.8b
+; CHECK-NEXT:    ushll v16.2d, v0.2s, #0
+; CHECK-NEXT:    ushll v17.2d, v4.2s, #0
+; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-NEXT:    ushll v7.2d, v7.2s, #0
+; CHECK-NEXT:    ucvtf v0.2d, v3.2d
 ; CHECK-NEXT:    ushll v5.2d, v5.2s, #0
 ; CHECK-NEXT:    ushll v6.2d, v6.2s, #0
-; CHECK-NEXT:    ushll v18.2d, v2.2s, #0
-; CHECK-NEXT:    ushll v1.2d, v4.2s, #0
-; CHECK-NEXT:    ushll v2.2d, v7.2s, #0
-; CHECK-NEXT:    ushll v3.2d, v16.2s, #0
-; CHECK-NEXT:    ushll v4.2d, v0.2s, #0
-; CHECK-NEXT:    ucvtf v0.2d, v1.2d
-; CHECK-NEXT:    ucvtf v1.2d, v2.2d
-; CHECK-NEXT:    ucvtf v2.2d, v3.2d
-; CHECK-NEXT:    ucvtf v3.2d, v4.2d
-; CHECK-NEXT:    ucvtf v4.2d, v17.2d
+; CHECK-NEXT:    ushll v18.2d, v1.2s, #0
+; CHECK-NEXT:    ucvtf v1.2d, v16.2d
+; CHECK-NEXT:    ucvtf v4.2d, v2.2d
+; CHECK-NEXT:    ucvtf v2.2d, v17.2d
+; CHECK-NEXT:    ucvtf v3.2d, v7.2d
 ; CHECK-NEXT:    ucvtf v5.2d, v5.2d
 ; CHECK-NEXT:    ucvtf v6.2d, v6.2d
 ; CHECK-NEXT:    ucvtf v7.2d, v18.2d
@@ -493,15 +493,15 @@ define <16 x double> @uitofp_v16i8_double(<16 x i8> %a) {
 define <8 x double> @uitofp_i16_double(<8 x i16> %a) {
 ; CHECK-LABEL: uitofp_i16_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v2.2d, v1.4s, #0
+; CHECK-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-NEXT:    ushll v2.2d, v1.2s, #0
 ; CHECK-NEXT:    ushll2 v3.2d, v0.4s, #0
-; CHECK-NEXT:    ushll v4.2d, v1.2s, #0
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ucvtf v1.2d, v3.2d
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v3.2d, v2.2d
+; CHECK-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-NEXT:    ushll v4.2d, v0.2s, #0
+; CHECK-NEXT:    ucvtf v0.2d, v2.2d
+; CHECK-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-NEXT:    ucvtf v1.2d, v1.2d
 ; CHECK-NEXT:    ucvtf v2.2d, v4.2d
 ; CHECK-NEXT:    ret
   %1 = uitofp <8 x i16> %a to <8 x double>
@@ -511,14 +511,14 @@ define <8 x double> @uitofp_i16_double(<8 x i16> %a) {
 define <8 x double> @uitofp_i32_double(<8 x i32> %a) {
 ; CHECK-LABEL: uitofp_i32_double:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushll2 v2.2d, v0.4s, #0
-; CHECK-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    ushll v4.2d, v1.2s, #0
-; CHECK-NEXT:    ucvtf v1.2d, v2.2d
-; CHECK-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-NEXT:    ucvtf v2.2d, v4.2d
+; CHECK-NEXT:    ushll v2.2d, v0.2s, #0
+; CHECK-NEXT:    ushll2 v3.2d, v0.4s, #0
+; CHECK-NEXT:    ushll2 v4.2d, v1.4s, #0
+; CHECK-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-NEXT:    ucvtf v0.2d, v2.2d
+; CHECK-NEXT:    ucvtf v1.2d, v3.2d
+; CHECK-NEXT:    ucvtf v3.2d, v4.2d
+; CHECK-NEXT:    ucvtf v2.2d, v5.2d
 ; CHECK-NEXT:    ret
   %1 = uitofp <8 x i32> %a to <8 x double>
   ret <8 x double> %1

diff --git a/llvm/test/CodeGen/AArch64/vector-gep.ll b/llvm/test/CodeGen/AArch64/vector-gep.ll
index c7858416e1796e..30317dce85e656 100644
--- a/llvm/test/CodeGen/AArch64/vector-gep.ll
+++ b/llvm/test/CodeGen/AArch64/vector-gep.ll
@@ -13,11 +13,11 @@ define <2 x ptr> @vector_gep(<2 x ptr> %0) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh0:
 ; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
-; CHECK-NEXT:    movi v2.2d, #0x000000ffffffff
 ; CHECK-NEXT:  Lloh1:
 ; CHECK-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
 ; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
 entry:

diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 48d9ecff68d24f..55d3943bbc7d89 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -136,11 +136,11 @@ define void @twosrc(ptr nocapture readonly %pSrc, ptr nocapture readonly %pSrc2,
 ; CHECK-NEXT:  .LBB3_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
-; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
-; CHECK-NEXT:    add x9, x1, x8
+; CHECK-NEXT:    add x10, x1, x8
 ; CHECK-NEXT:    add x8, x8, #32
+; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x9]
 ; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x9]
+; CHECK-NEXT:    ld2 { v2.4s, v3.4s }, [x10]
 ; CHECK-NEXT:    fmul v4.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    fmla v4.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    str q4, [x2], #16
@@ -301,18 +301,18 @@ define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) {
 ; CHECK-LABEL: transpose_s16_8x8_simpler:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldp q0, q1, [x0]
-; CHECK-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-NEXT:    ldp q2, q3, [x0, #64]
+; CHECK-NEXT:    ldp q4, q5, [x0, #32]
+; CHECK-NEXT:    ldp q6, q7, [x0, #96]
 ; CHECK-NEXT:    trn1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ldp q4, q5, [x0, #64]
-; CHECK-NEXT:    trn1 v2.8h, v2.8h, v3.8h
-; CHECK-NEXT:    ldp q6, q1, [x0, #96]
-; CHECK-NEXT:    trn1 v3.8h, v4.8h, v5.8h
-; CHECK-NEXT:    trn1 v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    trn1 v1.8h, v6.8h, v1.8h
-; CHECK-NEXT:    trn1 v4.4s, v2.4s, v1.4s
-; CHECK-NEXT:    zip2 v0.4s, v3.4s, v4.4s
-; CHECK-NEXT:    st2 { v3.2s, v4.2s }, [x0]
-; CHECK-NEXT:    str q0, [x0, #64]
+; CHECK-NEXT:    trn1 v1.8h, v2.8h, v3.8h
+; CHECK-NEXT:    trn1 v2.8h, v4.8h, v5.8h
+; CHECK-NEXT:    trn1 v3.8h, v6.8h, v7.8h
+; CHECK-NEXT:    trn1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    trn1 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT:    zip2 v2.4s, v0.4s, v1.4s
+; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
+; CHECK-NEXT:    str q2, [x0, #64]
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <8 x i16>, ptr %a, align 16
@@ -352,14 +352,14 @@ define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) {
 ; CHECK-LABEL: transpose_s16_8x8_simpler2:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldp q0, q2, [x0]
-; CHECK-NEXT:    ldp q3, q4, [x0, #32]
+; CHECK-NEXT:    ldp q3, q4, [x0, #64]
+; CHECK-NEXT:    ldp q5, q6, [x0, #32]
+; CHECK-NEXT:    ldp q7, q16, [x0, #96]
 ; CHECK-NEXT:    mov v0.h[5], v2.h[4]
-; CHECK-NEXT:    ldp q5, q6, [x0, #64]
-; CHECK-NEXT:    zip1 v3.8h, v3.8h, v4.8h
-; CHECK-NEXT:    ldp q7, q2, [x0, #96]
-; CHECK-NEXT:    zip1 v4.8h, v5.8h, v6.8h
-; CHECK-NEXT:    mov v0.s[1], v4.s[0]
-; CHECK-NEXT:    mov v7.h[5], v2.h[4]
+; CHECK-NEXT:    zip1 v2.8h, v3.8h, v4.8h
+; CHECK-NEXT:    zip1 v3.8h, v5.8h, v6.8h
+; CHECK-NEXT:    mov v7.h[5], v16.h[4]
+; CHECK-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-NEXT:    uzp1 v1.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    zip2 v2.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x0]
@@ -402,36 +402,36 @@ entry:
 define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %1, ptr nocapture noundef %2, ptr nocapture noundef %3, ptr nocapture noundef %4, ptr nocapture noundef %5, ptr nocapture noundef %6, ptr nocapture noundef %7) {
 ; CHECK-LABEL: transpose_s16_8x8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    ldr q2, [x1]
-; CHECK-NEXT:    ldr q0, [x4]
-; CHECK-NEXT:    ldr q3, [x2]
-; CHECK-NEXT:    ldr q4, [x3]
-; CHECK-NEXT:    ldr q6, [x5]
-; CHECK-NEXT:    trn1 v5.8h, v1.8h, v2.8h
-; CHECK-NEXT:    ldr q16, [x7]
-; CHECK-NEXT:    trn2 v1.8h, v1.8h, v2.8h
-; CHECK-NEXT:    ldr q2, [x6]
-; CHECK-NEXT:    trn1 v7.8h, v3.8h, v4.8h
-; CHECK-NEXT:    trn2 v3.8h, v3.8h, v4.8h
-; CHECK-NEXT:    trn1 v4.8h, v0.8h, v6.8h
-; CHECK-NEXT:    trn2 v0.8h, v0.8h, v6.8h
-; CHECK-NEXT:    trn1 v17.8h, v2.8h, v16.8h
-; CHECK-NEXT:    trn2 v2.8h, v2.8h, v16.8h
-; CHECK-NEXT:    trn1 v18.4s, v5.4s, v4.4s
-; CHECK-NEXT:    trn1 v20.4s, v1.4s, v0.4s
-; CHECK-NEXT:    trn2 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT:    trn2 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    trn1 v19.4s, v7.4s, v17.4s
-; CHECK-NEXT:    trn1 v21.4s, v3.4s, v2.4s
-; CHECK-NEXT:    trn2 v5.4s, v7.4s, v17.4s
-; CHECK-NEXT:    trn2 v1.4s, v3.4s, v2.4s
-; CHECK-NEXT:    st2 { v18.2s, v19.2s }, [x0]
-; CHECK-NEXT:    zip2 v2.4s, v18.4s, v19.4s
-; CHECK-NEXT:    st2 { v20.2s, v21.2s }, [x1]
-; CHECK-NEXT:    zip2 v3.4s, v20.4s, v21.4s
-; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x2]
-; CHECK-NEXT:    zip2 v4.4s, v4.4s, v5.4s
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldr q1, [x1]
+; CHECK-NEXT:    ldr q3, [x4]
+; CHECK-NEXT:    ldr q4, [x5]
+; CHECK-NEXT:    ldr q2, [x2]
+; CHECK-NEXT:    ldr q5, [x3]
+; CHECK-NEXT:    trn1 v16.8h, v0.8h, v1.8h
+; CHECK-NEXT:    trn2 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ldr q6, [x6]
+; CHECK-NEXT:    ldr q7, [x7]
+; CHECK-NEXT:    trn1 v17.8h, v3.8h, v4.8h
+; CHECK-NEXT:    trn2 v1.8h, v3.8h, v4.8h
+; CHECK-NEXT:    trn1 v18.8h, v2.8h, v5.8h
+; CHECK-NEXT:    trn2 v2.8h, v2.8h, v5.8h
+; CHECK-NEXT:    trn1 v19.8h, v6.8h, v7.8h
+; CHECK-NEXT:    trn2 v3.8h, v6.8h, v7.8h
+; CHECK-NEXT:    trn1 v4.4s, v16.4s, v17.4s
+; CHECK-NEXT:    trn1 v6.4s, v0.4s, v1.4s
+; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
+; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    trn1 v5.4s, v18.4s, v19.4s
+; CHECK-NEXT:    trn1 v7.4s, v2.4s, v3.4s
+; CHECK-NEXT:    trn2 v17.4s, v18.4s, v19.4s
+; CHECK-NEXT:    trn2 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x0]
+; CHECK-NEXT:    zip2 v2.4s, v4.4s, v5.4s
+; CHECK-NEXT:    zip2 v3.4s, v6.4s, v7.4s
+; CHECK-NEXT:    zip2 v4.4s, v16.4s, v17.4s
+; CHECK-NEXT:    st2 { v6.2s, v7.2s }, [x1]
+; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x2]
 ; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x3]
 ; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    str q2, [x4]
@@ -494,40 +494,40 @@ define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
 ; CHECK-LABEL: transpose_s16_8x8_:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    ldp q4, q5, [x0, #64]
 ; CHECK-NEXT:    mov x9, x0
-; CHECK-NEXT:    ldp q1, q2, [x0, #64]
+; CHECK-NEXT:    ldr q0, [x8, #16]!
 ; CHECK-NEXT:    mov x10, x0
+; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    ldp q6, q7, [x0, #96]
-; CHECK-NEXT:    trn1 v16.8h, v1.8h, v2.8h
-; CHECK-NEXT:    trn2 v1.8h, v1.8h, v2.8h
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q3, [x8, #16]!
-; CHECK-NEXT:    ldr q4, [x9, #32]!
-; CHECK-NEXT:    ldr q5, [x10, #48]!
-; CHECK-NEXT:    trn1 v2.8h, v6.8h, v7.8h
-; CHECK-NEXT:    trn2 v6.8h, v6.8h, v7.8h
-; CHECK-NEXT:    trn1 v7.8h, v0.8h, v3.8h
-; CHECK-NEXT:    trn2 v0.8h, v0.8h, v3.8h
 ; CHECK-NEXT:    trn1 v17.8h, v4.8h, v5.8h
-; CHECK-NEXT:    trn2 v3.8h, v4.8h, v5.8h
-; CHECK-NEXT:    trn1 v4.4s, v7.4s, v16.4s
-; CHECK-NEXT:    trn1 v18.4s, v0.4s, v1.4s
-; CHECK-NEXT:    trn2 v20.4s, v7.4s, v16.4s
-; CHECK-NEXT:    trn2 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    trn1 v5.4s, v17.4s, v2.4s
-; CHECK-NEXT:    trn1 v19.4s, v3.4s, v6.4s
-; CHECK-NEXT:    trn2 v21.4s, v17.4s, v2.4s
-; CHECK-NEXT:    trn2 v1.4s, v3.4s, v6.4s
-; CHECK-NEXT:    st2 { v4.2s, v5.2s }, [x0]
-; CHECK-NEXT:    zip2 v2.4s, v4.4s, v5.4s
+; CHECK-NEXT:    ldr q1, [x9, #32]!
+; CHECK-NEXT:    trn1 v16.8h, v3.8h, v0.8h
+; CHECK-NEXT:    ldr q2, [x10, #48]!
+; CHECK-NEXT:    trn2 v4.8h, v4.8h, v5.8h
+; CHECK-NEXT:    trn1 v19.8h, v6.8h, v7.8h
+; CHECK-NEXT:    trn2 v0.8h, v3.8h, v0.8h
+; CHECK-NEXT:    trn2 v3.8h, v6.8h, v7.8h
+; CHECK-NEXT:    trn1 v18.8h, v1.8h, v2.8h
+; CHECK-NEXT:    trn2 v1.8h, v1.8h, v2.8h
+; CHECK-NEXT:    trn1 v5.4s, v16.4s, v17.4s
+; CHECK-NEXT:    trn2 v16.4s, v16.4s, v17.4s
+; CHECK-NEXT:    trn1 v20.4s, v0.4s, v4.4s
+; CHECK-NEXT:    trn1 v6.4s, v18.4s, v19.4s
+; CHECK-NEXT:    trn2 v17.4s, v18.4s, v19.4s
+; CHECK-NEXT:    trn2 v18.4s, v0.4s, v4.4s
+; CHECK-NEXT:    trn1 v21.4s, v1.4s, v3.4s
+; CHECK-NEXT:    trn2 v19.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip2 v0.4s, v5.4s, v6.4s
+; CHECK-NEXT:    zip2 v2.4s, v16.4s, v17.4s
+; CHECK-NEXT:    st2 { v5.2s, v6.2s }, [x0]
+; CHECK-NEXT:    zip2 v1.4s, v20.4s, v21.4s
 ; CHECK-NEXT:    zip2 v3.4s, v18.4s, v19.4s
-; CHECK-NEXT:    st2 { v18.2s, v19.2s }, [x8]
-; CHECK-NEXT:    zip2 v4.4s, v20.4s, v21.4s
-; CHECK-NEXT:    st2 { v0.2s, v1.2s }, [x10]
-; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    st2 { v20.2s, v21.2s }, [x9]
-; CHECK-NEXT:    stp q2, q3, [x0, #64]
-; CHECK-NEXT:    stp q4, q0, [x0, #96]
+; CHECK-NEXT:    st2 { v20.2s, v21.2s }, [x8]
+; CHECK-NEXT:    st2 { v16.2s, v17.2s }, [x9]
+; CHECK-NEXT:    st2 { v18.2s, v19.2s }, [x10]
+; CHECK-NEXT:    stp q0, q1, [x0, #64]
+; CHECK-NEXT:    stp q2, q3, [x0, #96]
 ; CHECK-NEXT:    ret
   %2 = load <8 x i16>, ptr %0, align 16
   %3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
@@ -644,8 +644,8 @@ define void @store_factor3(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2
 ; CHECK-NEXT:    ext v6.16b, v1.16b, v2.16b, #12
 ; CHECK-NEXT:    zip2 v3.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    mov v3.s[0], v0.s[0]
-; CHECK-NEXT:    zip2 v4.4s, v1.4s, v6.4s
 ; CHECK-NEXT:    ext v0.16b, v2.16b, v0.16b, #12
+; CHECK-NEXT:    zip2 v4.4s, v1.4s, v6.4s
 ; CHECK-NEXT:    mov v4.s[0], v1.s[0]
 ; CHECK-NEXT:    zip2 v5.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    mov v5.s[0], v2.s[0]

diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll
index 30ba6f2e346439..a32147eebd7592 100644
--- a/llvm/test/CodeGen/AArch64/vselect-constants.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-constants.ll
@@ -10,12 +10,12 @@
 define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_C1_or_C2_vec:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    adrp x8, .LCPI0_0
 ; CHECK-NEXT:    adrp x9, .LCPI0_1
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI0_1]
+; CHECK-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
 ; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -27,11 +27,11 @@ define <4 x i32> @cmp_sel_C1_or_C2_vec(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: cmp_sel_C1_or_C2_vec:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI1_0
-; CHECK-NEXT:    adrp x9, .LCPI1_1
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT:    ldr q3, [x9, :lo12:.LCPI1_1]
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    adrp x9, .LCPI1_1
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI1_1]
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    ret
   %cond = icmp eq <4 x i32> %x, %y
   %add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
@@ -41,12 +41,12 @@ define <4 x i32> @cmp_sel_C1_or_C2_vec(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @sel_Cplus1_or_C_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_Cplus1_or_C_vec:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    adrp x8, .LCPI2_0
 ; CHECK-NEXT:    adrp x9, .LCPI2_1
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI2_1]
+; CHECK-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
 ; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -58,11 +58,11 @@ define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: cmp_sel_Cplus1_or_C_vec:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    adrp x9, .LCPI3_1
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT:    ldr q3, [x9, :lo12:.LCPI3_1]
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    adrp x9, .LCPI3_1
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI3_1]
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    ret
   %cond = icmp eq <4 x i32> %x, %y
   %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
@@ -72,12 +72,12 @@ define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
 define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_Cminus1_or_C_vec:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    adrp x9, .LCPI4_1
-; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI4_1]
+; CHECK-NEXT:    shl v0.4s, v0.4s, #31
 ; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
 ; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    ret
@@ -89,11 +89,11 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: cmp_sel_Cminus1_or_C_vec:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI5_0
-; CHECK-NEXT:    adrp x9, .LCPI5_1
 ; CHECK-NEXT:    cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI5_0]
-; CHECK-NEXT:    ldr q3, [x9, :lo12:.LCPI5_1]
-; CHECK-NEXT:    bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT:    adrp x9, .LCPI5_1
+; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI5_1]
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT:    ret
   %cond = icmp eq <4 x i32> %x, %y
   %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
@@ -370,9 +370,10 @@ define <vscale x 16 x i8> @signbit_mask_xor_nxv16i8(<vscale x 16 x i8> %a, <vsca
 ; CHECK-LABEL: signbit_mask_xor_nxv16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    eor z1.d, z0.d, z1.d
 ; CHECK-NEXT:    cmplt p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    eor z0.d, z0.d, z1.d
-; CHECK-NEXT:    mov z0.b, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z1.b, p0/m, #0 // =0x0
+; CHECK-NEXT:    mov z0.d, z1.d
 ; CHECK-NEXT:    ret
   %cond = icmp slt <vscale x 16 x i8> %a, zeroinitializer
   %xor = xor <vscale x 16 x i8> %a, %b

diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index 350595f0dc160c..0ae09ebe916302 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -8,15 +8,14 @@ define <16 x i32> @no_existing_zext(<16 x i8> %a, <16 x i32> %op) {
 ; CHECK-NEXT:    cmhi.16b v0, v0, v5
 ; CHECK-NEXT:    sshll.8h v5, v0, #0
 ; CHECK-NEXT:    sshll2.8h v0, v0, #0
+; CHECK-NEXT:    sshll2.4s v16, v0, #0
 ; CHECK-NEXT:    sshll.4s v6, v5, #0
 ; CHECK-NEXT:    sshll.4s v7, v0, #0
-; CHECK-NEXT:    sshll2.4s v0, v0, #0
 ; CHECK-NEXT:    sshll2.4s v5, v5, #0
-; CHECK-NEXT:    and.16b v4, v4, v0
-; CHECK-NEXT:    and.16b v5, v2, v5
-; CHECK-NEXT:    and.16b v2, v3, v7
+; CHECK-NEXT:    and.16b v4, v4, v16
 ; CHECK-NEXT:    and.16b v0, v1, v6
-; CHECK-NEXT:    mov.16b v1, v5
+; CHECK-NEXT:    and.16b v1, v2, v5
+; CHECK-NEXT:    and.16b v2, v3, v7
 ; CHECK-NEXT:    mov.16b v3, v4
 ; CHECK-NEXT:    ret
 entry:
@@ -28,23 +27,23 @@ entry:
 define <16 x i32> @second_compare_operand_not_splat(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: second_compare_operand_not_splat:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    cmgt.16b v1, v0, v1
 ; CHECK-NEXT:    ushll.8h v2, v0, #0
-; CHECK-NEXT:    ushll2.8h v3, v0, #0
-; CHECK-NEXT:    cmgt.16b v0, v0, v1
+; CHECK-NEXT:    ushll2.8h v0, v0, #0
+; CHECK-NEXT:    sshll.8h v3, v1, #0
+; CHECK-NEXT:    sshll2.8h v1, v1, #0
 ; CHECK-NEXT:    ushll.4s v4, v2, #0
-; CHECK-NEXT:    ushll.4s v5, v3, #0
-; CHECK-NEXT:    ushll2.4s v1, v2, #0
-; CHECK-NEXT:    ushll2.4s v2, v3, #0
-; CHECK-NEXT:    sshll.8h v3, v0, #0
-; CHECK-NEXT:    sshll2.8h v0, v0, #0
-; CHECK-NEXT:    sshll.4s v6, v3, #0
-; CHECK-NEXT:    sshll.4s v7, v0, #0
-; CHECK-NEXT:    sshll2.4s v0, v0, #0
+; CHECK-NEXT:    ushll.4s v5, v0, #0
+; CHECK-NEXT:    ushll2.4s v2, v2, #0
+; CHECK-NEXT:    ushll2.4s v6, v0, #0
+; CHECK-NEXT:    sshll.4s v0, v3, #0
+; CHECK-NEXT:    sshll.4s v7, v1, #0
 ; CHECK-NEXT:    sshll2.4s v16, v3, #0
-; CHECK-NEXT:    and.16b v3, v2, v0
-; CHECK-NEXT:    and.16b v1, v1, v16
+; CHECK-NEXT:    sshll2.4s v1, v1, #0
+; CHECK-NEXT:    and.16b v0, v4, v0
+; CHECK-NEXT:    and.16b v3, v6, v1
+; CHECK-NEXT:    and.16b v1, v2, v16
 ; CHECK-NEXT:    and.16b v2, v5, v7
-; CHECK-NEXT:    and.16b v0, v4, v6
 ; CHECK-NEXT:    ret
 entry:
   %ext = zext <16 x i8> %a to <16 x i32>
@@ -58,22 +57,22 @@ define <16 x i32> @same_zext_used_in_cmp_signed_pred_and_select(<16 x i8> %a) {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.16b v1, #10
 ; CHECK-NEXT:    ushll.8h v2, v0, #0
-; CHECK-NEXT:    ushll2.8h v3, v0, #0
 ; CHECK-NEXT:    ushll.4s v4, v2, #0
-; CHECK-NEXT:    cmgt.16b v0, v0, v1
-; CHECK-NEXT:    ushll.4s v5, v3, #0
-; CHECK-NEXT:    ushll2.4s v1, v3, #0
-; CHECK-NEXT:    sshll.8h v3, v0, #0
-; CHECK-NEXT:    sshll2.8h v0, v0, #0
 ; CHECK-NEXT:    ushll2.4s v2, v2, #0
-; CHECK-NEXT:    sshll.4s v6, v3, #0
-; CHECK-NEXT:    sshll.4s v7, v0, #0
-; CHECK-NEXT:    sshll2.4s v0, v0, #0
+; CHECK-NEXT:    cmgt.16b v1, v0, v1
+; CHECK-NEXT:    ushll2.8h v0, v0, #0
+; CHECK-NEXT:    sshll.8h v3, v1, #0
+; CHECK-NEXT:    sshll2.8h v1, v1, #0
+; CHECK-NEXT:    ushll.4s v5, v0, #0
+; CHECK-NEXT:    ushll2.4s v6, v0, #0
+; CHECK-NEXT:    sshll.4s v0, v3, #0
+; CHECK-NEXT:    sshll.4s v7, v1, #0
 ; CHECK-NEXT:    sshll2.4s v16, v3, #0
-; CHECK-NEXT:    and.16b v3, v1, v0
+; CHECK-NEXT:    sshll2.4s v1, v1, #0
+; CHECK-NEXT:    and.16b v0, v4, v0
+; CHECK-NEXT:    and.16b v3, v6, v1
 ; CHECK-NEXT:    and.16b v1, v2, v16
 ; CHECK-NEXT:    and.16b v2, v5, v7
-; CHECK-NEXT:    and.16b v0, v4, v6
 ; CHECK-NEXT:    ret
 entry:
   %ext = zext <16 x i8> %a to <16 x i32>
@@ -86,22 +85,22 @@ define <8 x i64> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i64(<8 x i8>
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i64:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    mov w8, #10
-; CHECK-NEXT:    ushll2.4s v2, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    dup.2d v1, x8
-; CHECK-NEXT:    ushll2.2d v3, v2, #0
+; CHECK-NEXT:    mov w8, #10 ; =0xa
+; CHECK-NEXT:    dup.2d v2, x8
+; CHECK-NEXT:    ushll.4s v1, v0, #0
+; CHECK-NEXT:    ushll2.4s v0, v0, #0
+; CHECK-NEXT:    ushll.2d v3, v1, #0
 ; CHECK-NEXT:    ushll2.2d v4, v0, #0
-; CHECK-NEXT:    ushll.2d v0, v0, #0
-; CHECK-NEXT:    ushll.2d v2, v2, #0
-; CHECK-NEXT:    cmhi.2d v5, v0, v1
-; CHECK-NEXT:    cmhi.2d v6, v2, v1
-; CHECK-NEXT:    cmhi.2d v7, v3, v1
-; CHECK-NEXT:    cmhi.2d v1, v4, v1
-; CHECK-NEXT:    and.16b v3, v3, v7
-; CHECK-NEXT:    and.16b v1, v4, v1
-; CHECK-NEXT:    and.16b v2, v2, v6
-; CHECK-NEXT:    and.16b v0, v0, v5
+; CHECK-NEXT:    ushll2.2d v1, v1, #0
+; CHECK-NEXT:    ushll.2d v5, v0, #0
+; CHECK-NEXT:    cmhi.2d v0, v3, v2
+; CHECK-NEXT:    cmhi.2d v7, v1, v2
+; CHECK-NEXT:    cmhi.2d v6, v5, v2
+; CHECK-NEXT:    cmhi.2d v2, v4, v2
+; CHECK-NEXT:    and.16b v0, v3, v0
+; CHECK-NEXT:    and.16b v1, v1, v7
+; CHECK-NEXT:    and.16b v3, v4, v2
+; CHECK-NEXT:    and.16b v2, v5, v6
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i8> %a to <8 x i64>
   %cmp = icmp ugt <8 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -113,21 +112,21 @@ define <8 x i64> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i64(<8 x i8>
 define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v16i32(<16 x i8> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v16i32:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ushll.8h v2, v0, #0
+; CHECK-NEXT:    ushll2.8h v0, v0, #0
 ; CHECK-NEXT:    movi.4s v1, #10
-; CHECK-NEXT:    ushll2.8h v2, v0, #0
-; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v3, v2, #0
+; CHECK-NEXT:    ushll.4s v3, v2, #0
 ; CHECK-NEXT:    ushll2.4s v4, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ushll.4s v2, v2, #0
-; CHECK-NEXT:    cmhi.4s v5, v0, v1
-; CHECK-NEXT:    cmhi.4s v6, v2, v1
-; CHECK-NEXT:    cmhi.4s v7, v3, v1
+; CHECK-NEXT:    ushll2.4s v2, v2, #0
+; CHECK-NEXT:    ushll.4s v5, v0, #0
+; CHECK-NEXT:    cmhi.4s v0, v3, v1
+; CHECK-NEXT:    cmhi.4s v7, v2, v1
+; CHECK-NEXT:    cmhi.4s v6, v5, v1
 ; CHECK-NEXT:    cmhi.4s v1, v4, v1
-; CHECK-NEXT:    and.16b v3, v3, v7
-; CHECK-NEXT:    and.16b v1, v4, v1
-; CHECK-NEXT:    and.16b v2, v2, v6
-; CHECK-NEXT:    and.16b v0, v0, v5
+; CHECK-NEXT:    and.16b v0, v3, v0
+; CHECK-NEXT:    and.16b v3, v4, v1
+; CHECK-NEXT:    and.16b v1, v2, v7
+; CHECK-NEXT:    and.16b v2, v5, v6
 ; CHECK-NEXT:    ret
   %ext = zext <16 x i8> %a to <16 x i32>
   %cmp = icmp ugt <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -138,14 +137,14 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v16i32(<16 x i
 define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32(<8 x i8> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    ushll.8h v0, v0, #0
+; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    ushll2.4s v2, v0, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    cmhi.4s v3, v2, v1
-; CHECK-NEXT:    cmhi.4s v4, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v4
+; CHECK-NEXT:    cmhi.4s v3, v0, v1
+; CHECK-NEXT:    cmhi.4s v1, v2, v1
+; CHECK-NEXT:    and.16b v1, v2, v1
+; CHECK-NEXT:    and.16b v0, v0, v3
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i8> %a to <8 x i32>
   %cmp = icmp ugt <8 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -159,10 +158,10 @@ define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_2(<8 x i1
 ; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    ushll2.4s v2, v0, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    cmhi.4s v3, v2, v1
-; CHECK-NEXT:    cmhi.4s v4, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v4
+; CHECK-NEXT:    cmhi.4s v3, v0, v1
+; CHECK-NEXT:    cmhi.4s v1, v2, v1
+; CHECK-NEXT:    and.16b v1, v2, v1
+; CHECK-NEXT:    and.16b v0, v0, v3
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i16> %a to <8 x i32>
   %cmp = icmp ugt <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -174,14 +173,14 @@ define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_2(<8 x i1
 define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15(<8 x i15> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    bic.8h v0, #128, lsl #8
+; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    ushll2.4s v2, v0, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    cmhi.4s v3, v2, v1
-; CHECK-NEXT:    cmhi.4s v4, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v4
+; CHECK-NEXT:    cmhi.4s v3, v0, v1
+; CHECK-NEXT:    cmhi.4s v1, v2, v1
+; CHECK-NEXT:    and.16b v1, v2, v1
+; CHECK-NEXT:    and.16b v0, v0, v3
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i15> %a to <8 x i32>
   %cmp = icmp ugt <8 x i15> %a, <i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10>
@@ -193,20 +192,20 @@ define <7 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v7i32(<7 x i16>
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v7i32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    movi.8h v1, #10
-; CHECK-NEXT:    ushll2.4s v2, v0, #0
+; CHECK-NEXT:    ushll.4s v2, v0, #0
 ; CHECK-NEXT:    cmhi.8h v1, v0, v1
-; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ushll2.4s v0, v0, #0
 ; CHECK-NEXT:    sshll.4s v3, v1, #0
 ; CHECK-NEXT:    sshll2.4s v1, v1, #0
-; CHECK-NEXT:    and.16b v0, v0, v3
-; CHECK-NEXT:    and.16b v1, v2, v1
-; CHECK-NEXT:    mov.s w1, v0[1]
-; CHECK-NEXT:    mov.s w2, v0[2]
-; CHECK-NEXT:    mov.s w3, v0[3]
-; CHECK-NEXT:    mov.s w5, v1[1]
-; CHECK-NEXT:    mov.s w6, v1[2]
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    fmov w4, s1
+; CHECK-NEXT:    and.16b v2, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v1
+; CHECK-NEXT:    mov.s w1, v2[1]
+; CHECK-NEXT:    mov.s w2, v2[2]
+; CHECK-NEXT:    mov.s w3, v2[3]
+; CHECK-NEXT:    mov.s w5, v0[1]
+; CHECK-NEXT:    mov.s w6, v0[2]
+; CHECK-NEXT:    fmov w0, s2
+; CHECK-NEXT:    fmov w4, s0
 ; CHECK-NEXT:    ret
   %ext = zext <7 x i16> %a to <7 x i32>
   %cmp = icmp ugt <7 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -220,17 +219,17 @@ define <3 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v3i16(<3 x i8>
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:  Lloh0:
 ; CHECK-NEXT:    adrp x8, lCPI9_0@PAGE
-; CHECK-NEXT:    mov.h v0[1], w1
 ; CHECK-NEXT:  Lloh1:
 ; CHECK-NEXT:    ldr d2, [x8, lCPI9_0@PAGEOFF]
+; CHECK-NEXT:    mov.h v0[1], w1
 ; CHECK-NEXT:    mov.h v0[2], w2
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    bic.4h v1, #255, lsl #8
 ; CHECK-NEXT:    cmhi.4h v1, v1, v2
-; CHECK-NEXT:    movi.2d v2, #0x0000ff000000ff
 ; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    movi.2d v1, #0x0000ff000000ff
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v0, v2
+; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
   %ext = zext <3 x i8> %a to <3 x i32>
@@ -274,10 +273,10 @@ define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
 ; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    ushll2.4s v2, v0, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    cmeq.4s v3, v2, v1
-; CHECK-NEXT:    cmeq.4s v4, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v4
+; CHECK-NEXT:    cmeq.4s v3, v0, v1
+; CHECK-NEXT:    cmeq.4s v1, v2, v1
+; CHECK-NEXT:    and.16b v1, v2, v1
+; CHECK-NEXT:    and.16b v0, v0, v3
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i16> %a to <8 x i32>
   %cmp = icmp eq <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -288,14 +287,14 @@ define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
 define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    bic.8h v0, #224, lsl #8
+; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    ushll2.4s v2, v0, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    cmeq.4s v3, v2, v1
-; CHECK-NEXT:    cmeq.4s v4, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v4
+; CHECK-NEXT:    cmeq.4s v3, v0, v1
+; CHECK-NEXT:    cmeq.4s v1, v2, v1
+; CHECK-NEXT:    and.16b v1, v2, v1
+; CHECK-NEXT:    and.16b v0, v0, v3
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i13> %a to <8 x i32>
   %cmp = icmp eq <8 x i13> %a, <i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10>
@@ -306,21 +305,21 @@ define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13>
 define <16 x i32> @same_zext_used_in_cmp_ne_and_select_v8i32(<16 x i8> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_ne_and_select_v8i32:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ushll.8h v2, v0, #0
+; CHECK-NEXT:    ushll2.8h v0, v0, #0
 ; CHECK-NEXT:    movi.4s v1, #10
-; CHECK-NEXT:    ushll2.8h v2, v0, #0
-; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v3, v2, #0
+; CHECK-NEXT:    ushll.4s v3, v2, #0
 ; CHECK-NEXT:    ushll2.4s v4, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ushll.4s v2, v2, #0
-; CHECK-NEXT:    cmeq.4s v5, v0, v1
-; CHECK-NEXT:    cmeq.4s v6, v2, v1
-; CHECK-NEXT:    cmeq.4s v7, v3, v1
+; CHECK-NEXT:    ushll2.4s v2, v2, #0
+; CHECK-NEXT:    ushll.4s v5, v0, #0
+; CHECK-NEXT:    cmeq.4s v0, v3, v1
+; CHECK-NEXT:    cmeq.4s v7, v2, v1
+; CHECK-NEXT:    cmeq.4s v6, v5, v1
 ; CHECK-NEXT:    cmeq.4s v1, v4, v1
-; CHECK-NEXT:    bic.16b v3, v3, v7
-; CHECK-NEXT:    bic.16b v1, v4, v1
-; CHECK-NEXT:    bic.16b v2, v2, v6
-; CHECK-NEXT:    bic.16b v0, v0, v5
+; CHECK-NEXT:    bic.16b v0, v3, v0
+; CHECK-NEXT:    bic.16b v3, v4, v1
+; CHECK-NEXT:    bic.16b v1, v2, v7
+; CHECK-NEXT:    bic.16b v2, v5, v6
 ; CHECK-NEXT:    ret
   %ext = zext <16 x i8> %a to <16 x i32>
   %cmp = icmp ne <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -333,47 +332,47 @@ define <16 x i32> @same_zext_used_in_cmp_ne_and_select_v8i32(<16 x i8> %a) {
 define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_other_use(<16 x i8> %a, <16 x i64> %v, ptr %ptr) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_other_use:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    mov.16b v16, v2
-; CHECK-NEXT:    movi.16b v2, #10
-; CHECK-NEXT:    ushll.8h v18, v0, #0
-; CHECK-NEXT:    ushll2.8h v20, v0, #0
-; CHECK-NEXT:    mov.16b v17, v1
-; CHECK-NEXT:    ldr q1, [sp]
-; CHECK-NEXT:    cmhi.16b v0, v0, v2
-; CHECK-NEXT:    ushll.4s v19, v18, #0
-; CHECK-NEXT:    sshll2.8h v21, v0, #0
-; CHECK-NEXT:    sshll.8h v0, v0, #0
-; CHECK-NEXT:    sshll2.4s v22, v21, #0
-; CHECK-NEXT:    sshll.4s v21, v21, #0
-; CHECK-NEXT:    sshll2.2d v23, v22, #0
-; CHECK-NEXT:    sshll.2d v24, v22, #0
-; CHECK-NEXT:    sshll2.4s v25, v0, #0
-; CHECK-NEXT:    sshll2.2d v26, v21, #0
-; CHECK-NEXT:    sshll.2d v28, v21, #0
-; CHECK-NEXT:    sshll2.2d v27, v25, #0
-; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    and.16b v1, v1, v23
-; CHECK-NEXT:    and.16b v7, v7, v24
-; CHECK-NEXT:    sshll.2d v29, v25, #0
-; CHECK-NEXT:    stp q7, q1, [x0, #96]
-; CHECK-NEXT:    and.16b v1, v6, v26
-; CHECK-NEXT:    and.16b v5, v5, v28
-; CHECK-NEXT:    ushll.4s v2, v20, #0
-; CHECK-NEXT:    stp q5, q1, [x0, #64]
-; CHECK-NEXT:    ushll2.4s v18, v18, #0
-; CHECK-NEXT:    ushll2.4s v20, v20, #0
-; CHECK-NEXT:    and.16b v1, v4, v27
-; CHECK-NEXT:    sshll2.2d v4, v0, #0
-; CHECK-NEXT:    sshll.2d v5, v0, #0
-; CHECK-NEXT:    and.16b v3, v3, v29
-; CHECK-NEXT:    stp q3, q1, [x0, #32]
-; CHECK-NEXT:    and.16b v3, v20, v22
-; CHECK-NEXT:    and.16b v1, v18, v25
-; CHECK-NEXT:    and.16b v2, v2, v21
-; CHECK-NEXT:    and.16b v0, v19, v0
-; CHECK-NEXT:    and.16b v4, v16, v4
-; CHECK-NEXT:    and.16b v5, v17, v5
-; CHECK-NEXT:    stp q5, q4, [x0]
+; CHECK-NEXT:    movi.16b v16, #10
+; CHECK-NEXT:    ushll.8h v19, v0, #0
+; CHECK-NEXT:    ldr q21, [sp]
+; CHECK-NEXT:    ushll.4s v24, v19, #0
+; CHECK-NEXT:    ushll2.4s v19, v19, #0
+; CHECK-NEXT:    cmhi.16b v16, v0, v16
+; CHECK-NEXT:    ushll2.8h v0, v0, #0
+; CHECK-NEXT:    sshll2.8h v17, v16, #0
+; CHECK-NEXT:    sshll.8h v16, v16, #0
+; CHECK-NEXT:    ushll.4s v25, v0, #0
+; CHECK-NEXT:    ushll2.4s v0, v0, #0
+; CHECK-NEXT:    sshll2.4s v18, v17, #0
+; CHECK-NEXT:    sshll.4s v17, v17, #0
+; CHECK-NEXT:    sshll2.4s v22, v16, #0
+; CHECK-NEXT:    sshll.4s v16, v16, #0
+; CHECK-NEXT:    sshll2.2d v20, v18, #0
+; CHECK-NEXT:    sshll.2d v23, v18, #0
+; CHECK-NEXT:    sshll2.2d v26, v17, #0
+; CHECK-NEXT:    sshll.2d v27, v17, #0
+; CHECK-NEXT:    and.16b v20, v21, v20
+; CHECK-NEXT:    sshll2.2d v21, v22, #0
+; CHECK-NEXT:    and.16b v7, v7, v23
+; CHECK-NEXT:    sshll.2d v23, v22, #0
+; CHECK-NEXT:    and.16b v6, v6, v26
+; CHECK-NEXT:    sshll2.2d v26, v16, #0
+; CHECK-NEXT:    and.16b v5, v5, v27
+; CHECK-NEXT:    stp q7, q20, [x0, #96]
+; CHECK-NEXT:    sshll.2d v20, v16, #0
+; CHECK-NEXT:    and.16b v21, v4, v21
+; CHECK-NEXT:    and.16b v4, v0, v18
+; CHECK-NEXT:    and.16b v7, v3, v23
+; CHECK-NEXT:    and.16b v3, v19, v22
+; CHECK-NEXT:    stp q5, q6, [x0, #64]
+; CHECK-NEXT:    and.16b v0, v24, v16
+; CHECK-NEXT:    and.16b v6, v2, v26
+; CHECK-NEXT:    and.16b v2, v25, v17
+; CHECK-NEXT:    and.16b v5, v1, v20
+; CHECK-NEXT:    mov.16b v1, v3
+; CHECK-NEXT:    mov.16b v3, v4
+; CHECK-NEXT:    stp q7, q21, [x0, #32]
+; CHECK-NEXT:    stp q5, q6, [x0]
 ; CHECK-NEXT:    ret
 entry:
   %ext = zext <16 x i8> %a to <16 x i32>
@@ -387,21 +386,21 @@ entry:
 define <16 x i32> @same_sext_used_in_cmp_signed_pred_and_select_v16i32(<16 x i8> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_signed_pred_and_select_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    sshll.8h v2, v0, #0
+; CHECK-NEXT:    sshll2.8h v0, v0, #0
 ; CHECK-NEXT:    movi.4s v1, #10
-; CHECK-NEXT:    sshll2.8h v2, v0, #0
-; CHECK-NEXT:    sshll.8h v0, v0, #0
-; CHECK-NEXT:    sshll2.4s v3, v2, #0
+; CHECK-NEXT:    sshll.4s v3, v2, #0
 ; CHECK-NEXT:    sshll2.4s v4, v0, #0
-; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    sshll.4s v2, v2, #0
-; CHECK-NEXT:    cmgt.4s v5, v0, v1
-; CHECK-NEXT:    cmgt.4s v6, v2, v1
-; CHECK-NEXT:    cmgt.4s v7, v3, v1
+; CHECK-NEXT:    sshll2.4s v2, v2, #0
+; CHECK-NEXT:    sshll.4s v5, v0, #0
+; CHECK-NEXT:    cmgt.4s v0, v3, v1
+; CHECK-NEXT:    cmgt.4s v7, v2, v1
+; CHECK-NEXT:    cmgt.4s v6, v5, v1
 ; CHECK-NEXT:    cmgt.4s v1, v4, v1
-; CHECK-NEXT:    and.16b v3, v3, v7
-; CHECK-NEXT:    and.16b v1, v4, v1
-; CHECK-NEXT:    and.16b v2, v2, v6
-; CHECK-NEXT:    and.16b v0, v0, v5
+; CHECK-NEXT:    and.16b v0, v3, v0
+; CHECK-NEXT:    and.16b v3, v4, v1
+; CHECK-NEXT:    and.16b v1, v2, v7
+; CHECK-NEXT:    and.16b v2, v5, v6
 ; CHECK-NEXT:    ret
 entry:
   %ext = sext <16 x i8> %a to <16 x i32>
@@ -416,10 +415,10 @@ define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
 ; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    sshll2.4s v2, v0, #0
 ; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    cmeq.4s v3, v2, v1
-; CHECK-NEXT:    cmeq.4s v4, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v4
+; CHECK-NEXT:    cmeq.4s v3, v0, v1
+; CHECK-NEXT:    cmeq.4s v1, v2, v1
+; CHECK-NEXT:    and.16b v1, v2, v1
+; CHECK-NEXT:    and.16b v0, v0, v3
 ; CHECK-NEXT:    ret
   %ext = sext <8 x i16> %a to <8 x i32>
   %cmp = icmp eq <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -430,17 +429,17 @@ define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
 define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ushll2.4s v2, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ushll.4s v2, v0, #0
+; CHECK-NEXT:    ushll2.4s v0, v0, #0
 ; CHECK-NEXT:    movi.4s v1, #10
-; CHECK-NEXT:    shl.4s v2, v2, #19
 ; CHECK-NEXT:    shl.4s v0, v0, #19
-; CHECK-NEXT:    sshr.4s v2, v2, #19
+; CHECK-NEXT:    shl.4s v2, v2, #19
 ; CHECK-NEXT:    sshr.4s v0, v0, #19
+; CHECK-NEXT:    sshr.4s v2, v2, #19
 ; CHECK-NEXT:    cmeq.4s v3, v2, v1
-; CHECK-NEXT:    cmeq.4s v4, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v4
+; CHECK-NEXT:    cmeq.4s v1, v0, v1
+; CHECK-NEXT:    and.16b v1, v0, v1
+; CHECK-NEXT:    and.16b v0, v2, v3
 ; CHECK-NEXT:    ret
   %ext = sext <8 x i13> %a to <8 x i32>
   %cmp = icmp eq <8 x i13> %a, <i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10>
@@ -451,21 +450,21 @@ define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13>
 define <16 x i32> @same_sext_used_in_cmp_ne_and_select_v8i32(<16 x i8> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_ne_and_select_v8i32:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sshll.8h v2, v0, #0
+; CHECK-NEXT:    sshll2.8h v0, v0, #0
 ; CHECK-NEXT:    movi.4s v1, #10
-; CHECK-NEXT:    sshll2.8h v2, v0, #0
-; CHECK-NEXT:    sshll.8h v0, v0, #0
-; CHECK-NEXT:    sshll2.4s v3, v2, #0
+; CHECK-NEXT:    sshll.4s v3, v2, #0
 ; CHECK-NEXT:    sshll2.4s v4, v0, #0
-; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    sshll.4s v2, v2, #0
-; CHECK-NEXT:    cmeq.4s v5, v0, v1
-; CHECK-NEXT:    cmeq.4s v6, v2, v1
-; CHECK-NEXT:    cmeq.4s v7, v3, v1
+; CHECK-NEXT:    sshll2.4s v2, v2, #0
+; CHECK-NEXT:    sshll.4s v5, v0, #0
+; CHECK-NEXT:    cmeq.4s v0, v3, v1
+; CHECK-NEXT:    cmeq.4s v7, v2, v1
+; CHECK-NEXT:    cmeq.4s v6, v5, v1
 ; CHECK-NEXT:    cmeq.4s v1, v4, v1
-; CHECK-NEXT:    bic.16b v3, v3, v7
-; CHECK-NEXT:    bic.16b v1, v4, v1
-; CHECK-NEXT:    bic.16b v2, v2, v6
-; CHECK-NEXT:    bic.16b v0, v0, v5
+; CHECK-NEXT:    bic.16b v0, v3, v0
+; CHECK-NEXT:    bic.16b v3, v4, v1
+; CHECK-NEXT:    bic.16b v1, v2, v7
+; CHECK-NEXT:    bic.16b v2, v5, v6
 ; CHECK-NEXT:    ret
   %ext = sext <16 x i8> %a to <16 x i32>
   %cmp = icmp ne <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -479,10 +478,10 @@ define <8 x i32> @same_sext_used_in_cmp_signed_pred_and_select_v8i32(<8 x i16> %
 ; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    sshll2.4s v2, v0, #0
 ; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    cmgt.4s v3, v2, v1
-; CHECK-NEXT:    cmgt.4s v4, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v4
+; CHECK-NEXT:    cmgt.4s v3, v0, v1
+; CHECK-NEXT:    cmgt.4s v1, v2, v1
+; CHECK-NEXT:    and.16b v1, v2, v1
+; CHECK-NEXT:    and.16b v0, v0, v3
 ; CHECK-NEXT:    ret
 entry:
   %ext = sext <8 x i16> %a to <8 x i32>
@@ -494,17 +493,17 @@ entry:
 define <8 x i32> @same_sext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15(<8 x i15> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    ushll2.4s v2, v0, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ushll.4s v2, v0, #0
+; CHECK-NEXT:    ushll2.4s v0, v0, #0
 ; CHECK-NEXT:    movi.4s v1, #10
-; CHECK-NEXT:    shl.4s v2, v2, #17
 ; CHECK-NEXT:    shl.4s v0, v0, #17
-; CHECK-NEXT:    sshr.4s v2, v2, #17
+; CHECK-NEXT:    shl.4s v2, v2, #17
 ; CHECK-NEXT:    sshr.4s v0, v0, #17
+; CHECK-NEXT:    sshr.4s v2, v2, #17
 ; CHECK-NEXT:    cmge.4s v3, v2, v1
-; CHECK-NEXT:    cmge.4s v4, v0, v1
-; CHECK-NEXT:    and.16b v1, v2, v3
-; CHECK-NEXT:    and.16b v0, v0, v4
+; CHECK-NEXT:    cmge.4s v1, v0, v1
+; CHECK-NEXT:    and.16b v1, v0, v1
+; CHECK-NEXT:    and.16b v0, v2, v3
 ; CHECK-NEXT:    ret
   %ext = sext <8 x i15> %a to <8 x i32>
   %cmp = icmp sge <8 x i15> %a, <i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10>
@@ -516,23 +515,23 @@ define <16 x i32> @same_sext_used_in_cmp_unsigned_pred_and_select(<16 x i8> %a)
 ; CHECK-LABEL: same_sext_used_in_cmp_unsigned_pred_and_select:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.16b v1, #10
-; CHECK-NEXT:    sshll.8h v3, v0, #0
-; CHECK-NEXT:    sshll2.8h v2, v0, #0
-; CHECK-NEXT:    cmhi.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v3, v3, #8
-; CHECK-NEXT:    sshll.8h v5, v0, #0
-; CHECK-NEXT:    sshll2.8h v0, v0, #0
+; CHECK-NEXT:    sshll.8h v2, v0, #0
 ; CHECK-NEXT:    ext.16b v4, v2, v2, #8
-; CHECK-NEXT:    ext.16b v6, v5, v5, #8
-; CHECK-NEXT:    ext.16b v7, v0, v0, #8
-; CHECK-NEXT:    and.8b v0, v2, v0
-; CHECK-NEXT:    sshll.4s v2, v0, #0
-; CHECK-NEXT:    and.8b v0, v3, v5
-; CHECK-NEXT:    and.8b v1, v1, v6
-; CHECK-NEXT:    and.8b v3, v4, v7
-; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    sshll.4s v1, v1, #0
+; CHECK-NEXT:    cmhi.16b v1, v0, v1
+; CHECK-NEXT:    sshll2.8h v0, v0, #0
+; CHECK-NEXT:    sshll.8h v3, v1, #0
+; CHECK-NEXT:    sshll2.8h v1, v1, #0
+; CHECK-NEXT:    ext.16b v5, v0, v0, #8
+; CHECK-NEXT:    ext.16b v6, v3, v3, #8
+; CHECK-NEXT:    ext.16b v7, v1, v1, #8
+; CHECK-NEXT:    and.8b v2, v2, v3
+; CHECK-NEXT:    and.8b v1, v0, v1
+; CHECK-NEXT:    sshll.4s v0, v2, #0
+; CHECK-NEXT:    and.8b v3, v5, v7
+; CHECK-NEXT:    and.8b v4, v4, v6
+; CHECK-NEXT:    sshll.4s v2, v1, #0
 ; CHECK-NEXT:    sshll.4s v3, v3, #0
+; CHECK-NEXT:    sshll.4s v1, v4, #0
 ; CHECK-NEXT:    ret
 entry:
   %ext = sext <16 x i8> %a to <16 x i32>
@@ -546,22 +545,22 @@ define <16 x i32> @same_zext_used_in_cmp_signed_pred_and_select_can_convert_to_u
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.2d v1, #0xffffffffffffffff
 ; CHECK-NEXT:    ushll.8h v2, v0, #0
-; CHECK-NEXT:    ushll2.8h v3, v0, #0
 ; CHECK-NEXT:    ushll.4s v4, v2, #0
-; CHECK-NEXT:    cmgt.16b v0, v0, v1
-; CHECK-NEXT:    ushll.4s v5, v3, #0
-; CHECK-NEXT:    ushll2.4s v1, v3, #0
-; CHECK-NEXT:    sshll.8h v3, v0, #0
-; CHECK-NEXT:    sshll2.8h v0, v0, #0
 ; CHECK-NEXT:    ushll2.4s v2, v2, #0
-; CHECK-NEXT:    sshll.4s v6, v3, #0
-; CHECK-NEXT:    sshll.4s v7, v0, #0
-; CHECK-NEXT:    sshll2.4s v0, v0, #0
+; CHECK-NEXT:    cmgt.16b v1, v0, v1
+; CHECK-NEXT:    ushll2.8h v0, v0, #0
+; CHECK-NEXT:    sshll.8h v3, v1, #0
+; CHECK-NEXT:    sshll2.8h v1, v1, #0
+; CHECK-NEXT:    ushll.4s v5, v0, #0
+; CHECK-NEXT:    ushll2.4s v6, v0, #0
+; CHECK-NEXT:    sshll.4s v0, v3, #0
+; CHECK-NEXT:    sshll.4s v7, v1, #0
 ; CHECK-NEXT:    sshll2.4s v16, v3, #0
-; CHECK-NEXT:    and.16b v3, v1, v0
+; CHECK-NEXT:    sshll2.4s v1, v1, #0
+; CHECK-NEXT:    and.16b v0, v4, v0
+; CHECK-NEXT:    and.16b v3, v6, v1
 ; CHECK-NEXT:    and.16b v1, v2, v16
 ; CHECK-NEXT:    and.16b v2, v5, v7
-; CHECK-NEXT:    and.16b v0, v4, v6
 ; CHECK-NEXT:    ret
 entry:
   %ext = zext <16 x i8> %a to <16 x i32>
@@ -574,52 +573,53 @@ define void @extension_in_loop_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: extension_in_loop_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:    adrp x9, lCPI24_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI24_0@PAGE
+; CHECK-NEXT:    movi.2d v0, #0xffffffffffffffff
 ; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    adrp x10, lCPI24_1@PAGE
+; CHECK-NEXT:    adrp x9, lCPI24_2@PAGE
 ; CHECK-NEXT:  Lloh4:
-; CHECK-NEXT:    adrp x11, lCPI24_2@PAGE
+; CHECK-NEXT:    ldr q1, [x8, lCPI24_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:    adrp x12, lCPI24_3@PAGE
-; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI24_1@PAGE
 ; CHECK-NEXT:  Lloh6:
-; CHECK-NEXT:    ldr q0, [x9, lCPI24_0@PAGEOFF]
+; CHECK-NEXT:    adrp x10, lCPI24_3@PAGE
 ; CHECK-NEXT:  Lloh7:
-; CHECK-NEXT:    ldr q1, [x10, lCPI24_1@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x8, lCPI24_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh8:
-; CHECK-NEXT:    ldr q3, [x11, lCPI24_2@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x9, lCPI24_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh9:
-; CHECK-NEXT:    ldr q4, [x12, lCPI24_3@PAGEOFF]
+; CHECK-NEXT:    ldr q4, [x10, lCPI24_3@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB24_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q5, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    cmgt.16b v6, v5, v2
-; CHECK-NEXT:    tbl.16b v7, { v5 }, v0
+; CHECK-NEXT:    cmgt.16b v6, v5, v0
 ; CHECK-NEXT:    tbl.16b v16, { v5 }, v1
-; CHECK-NEXT:    sshll2.8h v18, v6, #0
-; CHECK-NEXT:    tbl.16b v17, { v5 }, v3
-; CHECK-NEXT:    sshll2.4s v19, v18, #0
-; CHECK-NEXT:    sshll.4s v18, v18, #0
+; CHECK-NEXT:    tbl.16b v17, { v5 }, v2
+; CHECK-NEXT:    tbl.16b v19, { v5 }, v3
 ; CHECK-NEXT:    tbl.16b v5, { v5 }, v4
+; CHECK-NEXT:    sshll2.8h v7, v6, #0
 ; CHECK-NEXT:    sshll.8h v6, v6, #0
-; CHECK-NEXT:    and.16b v7, v7, v19
-; CHECK-NEXT:    and.16b v16, v16, v18
-; CHECK-NEXT:    stp q16, q7, [x1, #32]
-; CHECK-NEXT:    sshll2.4s v7, v6, #0
+; CHECK-NEXT:    sshll2.4s v18, v7, #0
+; CHECK-NEXT:    sshll.4s v7, v7, #0
+; CHECK-NEXT:    sshll2.4s v20, v6, #0
 ; CHECK-NEXT:    sshll.4s v6, v6, #0
+; CHECK-NEXT:    and.16b v16, v16, v18
 ; CHECK-NEXT:    and.16b v7, v17, v7
+; CHECK-NEXT:    and.16b v17, v19, v20
 ; CHECK-NEXT:    and.16b v5, v5, v6
-; CHECK-NEXT:    stp q5, q7, [x1], #64
+; CHECK-NEXT:    stp q7, q16, [x1, #32]
+; CHECK-NEXT:    stp q5, q17, [x1], #64
 ; CHECK-NEXT:    b.ne LBB24_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh5, Lloh9
-; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh8
-; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
+; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh9
+; CHECK-NEXT:    .loh AdrpLdr Lloh5, Lloh7
+; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh8
+; CHECK-NEXT:    .loh AdrpAdrp Lloh2, Lloh5
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh4
 entry:
   br label %loop
 
@@ -644,52 +644,53 @@ define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh10:
-; CHECK-NEXT:    adrp x9, lCPI25_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI25_0@PAGE
+; CHECK-NEXT:    movi.2d v0, #0xffffffffffffffff
 ; CHECK-NEXT:  Lloh11:
-; CHECK-NEXT:    adrp x10, lCPI25_1@PAGE
+; CHECK-NEXT:    adrp x9, lCPI25_2@PAGE
 ; CHECK-NEXT:  Lloh12:
-; CHECK-NEXT:    adrp x11, lCPI25_2@PAGE
+; CHECK-NEXT:    ldr q1, [x8, lCPI25_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh13:
-; CHECK-NEXT:    adrp x12, lCPI25_3@PAGE
-; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI25_1@PAGE
 ; CHECK-NEXT:  Lloh14:
-; CHECK-NEXT:    ldr q0, [x9, lCPI25_0@PAGEOFF]
+; CHECK-NEXT:    adrp x10, lCPI25_3@PAGE
 ; CHECK-NEXT:  Lloh15:
-; CHECK-NEXT:    ldr q1, [x10, lCPI25_1@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x8, lCPI25_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh16:
-; CHECK-NEXT:    ldr q3, [x11, lCPI25_2@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x9, lCPI25_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh17:
-; CHECK-NEXT:    ldr q4, [x12, lCPI25_3@PAGEOFF]
+; CHECK-NEXT:    ldr q4, [x10, lCPI25_3@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB25_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q5, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    cmgt.16b v6, v5, v2
-; CHECK-NEXT:    tbl.16b v7, { v5 }, v0
+; CHECK-NEXT:    cmgt.16b v6, v5, v0
 ; CHECK-NEXT:    tbl.16b v16, { v5 }, v1
-; CHECK-NEXT:    sshll2.8h v18, v6, #0
-; CHECK-NEXT:    tbl.16b v17, { v5 }, v3
-; CHECK-NEXT:    sshll2.4s v19, v18, #0
-; CHECK-NEXT:    sshll.4s v18, v18, #0
+; CHECK-NEXT:    tbl.16b v17, { v5 }, v2
+; CHECK-NEXT:    tbl.16b v19, { v5 }, v3
 ; CHECK-NEXT:    tbl.16b v5, { v5 }, v4
+; CHECK-NEXT:    sshll2.8h v7, v6, #0
 ; CHECK-NEXT:    sshll.8h v6, v6, #0
-; CHECK-NEXT:    and.16b v7, v7, v19
-; CHECK-NEXT:    and.16b v16, v16, v18
-; CHECK-NEXT:    stp q16, q7, [x1, #32]
-; CHECK-NEXT:    sshll2.4s v7, v6, #0
+; CHECK-NEXT:    sshll2.4s v18, v7, #0
+; CHECK-NEXT:    sshll.4s v7, v7, #0
+; CHECK-NEXT:    sshll2.4s v20, v6, #0
 ; CHECK-NEXT:    sshll.4s v6, v6, #0
+; CHECK-NEXT:    and.16b v16, v16, v18
 ; CHECK-NEXT:    and.16b v7, v17, v7
+; CHECK-NEXT:    and.16b v17, v19, v20
 ; CHECK-NEXT:    and.16b v5, v5, v6
-; CHECK-NEXT:    stp q5, q7, [x1], #64
+; CHECK-NEXT:    stp q7, q16, [x1, #32]
+; CHECK-NEXT:    stp q5, q17, [x1], #64
 ; CHECK-NEXT:    b.ne LBB25_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh17
-; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh16
-; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh15
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh14
+; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh17
+; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh15
+; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh16
+; CHECK-NEXT:    .loh AdrpAdrp Lloh10, Lloh13
+; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh12
 entry:
   br label %loop
 
@@ -715,52 +716,53 @@ define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh18:
-; CHECK-NEXT:    adrp x9, lCPI26_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI26_0@PAGE
+; CHECK-NEXT:    movi.2d v0, #0xffffffffffffffff
 ; CHECK-NEXT:  Lloh19:
-; CHECK-NEXT:    adrp x10, lCPI26_1@PAGE
+; CHECK-NEXT:    adrp x9, lCPI26_2@PAGE
 ; CHECK-NEXT:  Lloh20:
-; CHECK-NEXT:    adrp x11, lCPI26_2@PAGE
+; CHECK-NEXT:    ldr q1, [x8, lCPI26_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh21:
-; CHECK-NEXT:    adrp x12, lCPI26_3@PAGE
-; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x8, lCPI26_1@PAGE
 ; CHECK-NEXT:  Lloh22:
-; CHECK-NEXT:    ldr q0, [x9, lCPI26_0@PAGEOFF]
+; CHECK-NEXT:    adrp x10, lCPI26_3@PAGE
 ; CHECK-NEXT:  Lloh23:
-; CHECK-NEXT:    ldr q1, [x10, lCPI26_1@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x8, lCPI26_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh24:
-; CHECK-NEXT:    ldr q3, [x11, lCPI26_2@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x9, lCPI26_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh25:
-; CHECK-NEXT:    ldr q4, [x12, lCPI26_3@PAGEOFF]
+; CHECK-NEXT:    ldr q4, [x10, lCPI26_3@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB26_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q5, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    cmgt.16b v6, v5, v2
-; CHECK-NEXT:    tbl.16b v7, { v5 }, v0
+; CHECK-NEXT:    cmgt.16b v6, v5, v0
 ; CHECK-NEXT:    tbl.16b v16, { v5 }, v1
-; CHECK-NEXT:    sshll2.8h v18, v6, #0
-; CHECK-NEXT:    tbl.16b v17, { v5 }, v3
-; CHECK-NEXT:    sshll2.4s v19, v18, #0
-; CHECK-NEXT:    sshll.4s v18, v18, #0
+; CHECK-NEXT:    tbl.16b v17, { v5 }, v2
+; CHECK-NEXT:    tbl.16b v19, { v5 }, v3
 ; CHECK-NEXT:    tbl.16b v5, { v5 }, v4
+; CHECK-NEXT:    sshll2.8h v7, v6, #0
 ; CHECK-NEXT:    sshll.8h v6, v6, #0
-; CHECK-NEXT:    and.16b v7, v7, v19
-; CHECK-NEXT:    and.16b v16, v16, v18
-; CHECK-NEXT:    stp q16, q7, [x1, #32]
-; CHECK-NEXT:    sshll2.4s v7, v6, #0
+; CHECK-NEXT:    sshll2.4s v18, v7, #0
+; CHECK-NEXT:    sshll.4s v7, v7, #0
+; CHECK-NEXT:    sshll2.4s v20, v6, #0
 ; CHECK-NEXT:    sshll.4s v6, v6, #0
+; CHECK-NEXT:    and.16b v16, v16, v18
 ; CHECK-NEXT:    and.16b v7, v17, v7
+; CHECK-NEXT:    and.16b v17, v19, v20
 ; CHECK-NEXT:    and.16b v5, v5, v6
-; CHECK-NEXT:    stp q5, q7, [x1], #64
+; CHECK-NEXT:    stp q7, q16, [x1, #32]
+; CHECK-NEXT:    stp q5, q17, [x1], #64
 ; CHECK-NEXT:    b.ne LBB26_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh21, Lloh25
-; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh24
-; CHECK-NEXT:    .loh AdrpLdr Lloh19, Lloh23
-; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh22
+; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh25
+; CHECK-NEXT:    .loh AdrpLdr Lloh21, Lloh23
+; CHECK-NEXT:    .loh AdrpLdr Lloh19, Lloh24
+; CHECK-NEXT:    .loh AdrpAdrp Lloh18, Lloh21
+; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh20
 entry:
   br label %loop
 

diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
index 53f0d784ceed15..e21015ad3db30c 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -105,14 +105,14 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr x8, [x1]
 ; ALL-NEXT:    ldp x10, x9, [x0]
 ; ALL-NEXT:    lsl x8, x8, #3
-; ALL-NEXT:    and x11, x8, #0x38
-; ALL-NEXT:    mvn w12, w8
+; ALL-NEXT:    lsl x11, x9, #1
+; ALL-NEXT:    and x12, x8, #0x38
+; ALL-NEXT:    mvn w13, w8
 ; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsl x13, x9, #1
-; ALL-NEXT:    lsr x10, x10, x11
-; ALL-NEXT:    lsl x12, x13, x12
-; ALL-NEXT:    lsr x9, x9, x11
-; ALL-NEXT:    orr x8, x12, x10
+; ALL-NEXT:    lsr x10, x10, x12
+; ALL-NEXT:    lsl x11, x11, x13
+; ALL-NEXT:    lsr x9, x9, x12
+; ALL-NEXT:    orr x8, x11, x10
 ; ALL-NEXT:    csel x10, xzr, x9, ne
 ; ALL-NEXT:    csel x8, x9, x8, ne
 ; ALL-NEXT:    stp x8, x10, [x2]
@@ -130,14 +130,14 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr x8, [x1]
 ; ALL-NEXT:    ldp x9, x10, [x0]
 ; ALL-NEXT:    lsl x8, x8, #3
-; ALL-NEXT:    and x11, x8, #0x38
-; ALL-NEXT:    mvn w12, w8
-; ALL-NEXT:    lsr x13, x9, #1
+; ALL-NEXT:    lsr x11, x9, #1
+; ALL-NEXT:    and x12, x8, #0x38
+; ALL-NEXT:    mvn w13, w8
 ; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsl x10, x10, x11
-; ALL-NEXT:    lsr x12, x13, x12
-; ALL-NEXT:    lsl x9, x9, x11
-; ALL-NEXT:    orr x8, x10, x12
+; ALL-NEXT:    lsl x10, x10, x12
+; ALL-NEXT:    lsr x11, x11, x13
+; ALL-NEXT:    lsl x9, x9, x12
+; ALL-NEXT:    orr x8, x10, x11
 ; ALL-NEXT:    csel x10, xzr, x9, ne
 ; ALL-NEXT:    csel x8, x9, x8, ne
 ; ALL-NEXT:    stp x10, x8, [x2]
@@ -155,18 +155,18 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-NEXT:    ldr x8, [x1]
 ; ALL-NEXT:    ldp x10, x9, [x0]
 ; ALL-NEXT:    lsl x8, x8, #3
-; ALL-NEXT:    and x11, x8, #0x38
-; ALL-NEXT:    mvn w12, w8
+; ALL-NEXT:    lsl x11, x9, #1
+; ALL-NEXT:    and x12, x8, #0x38
+; ALL-NEXT:    mvn w13, w8
 ; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsl x13, x9, #1
-; ALL-NEXT:    asr x8, x9, #63
-; ALL-NEXT:    lsr x10, x10, x11
-; ALL-NEXT:    lsl x12, x13, x12
-; ALL-NEXT:    asr x11, x9, x11
-; ALL-NEXT:    orr x9, x12, x10
-; ALL-NEXT:    csel x8, x8, x11, ne
-; ALL-NEXT:    csel x9, x11, x9, ne
-; ALL-NEXT:    stp x9, x8, [x2]
+; ALL-NEXT:    lsr x10, x10, x12
+; ALL-NEXT:    lsl x11, x11, x13
+; ALL-NEXT:    asr x12, x9, x12
+; ALL-NEXT:    asr x9, x9, #63
+; ALL-NEXT:    orr x8, x11, x10
+; ALL-NEXT:    csel x9, x9, x12, ne
+; ALL-NEXT:    csel x8, x12, x8, ne
+; ALL-NEXT:    stp x8, x9, [x2]
 ; ALL-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
   %byteOff = load i128, ptr %byteOff.ptr, align 1
@@ -180,20 +180,20 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_32bytes:
 ; ALL:       // %bb.0:
 ; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    ldp x10, x11, [x0, #16]
+; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
-; ALL-NEXT:    and x9, x9, #0x1f
+; ALL-NEXT:    stp x9, x8, [sp, #16]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    and x9, x10, #0x1f
+; ALL-NEXT:    str q1, [sp]
 ; ALL-NEXT:    add x8, x8, x9
 ; ALL-NEXT:    stp q0, q0, [sp, #32]
-; ALL-NEXT:    stp x10, x11, [sp, #16]
-; ALL-NEXT:    str q1, [sp]
 ; ALL-NEXT:    ldp x10, x9, [x8, #16]
 ; ALL-NEXT:    ldr q0, [x8]
-; ALL-NEXT:    stp x10, x9, [x2, #16]
 ; ALL-NEXT:    str q0, [x2]
+; ALL-NEXT:    stp x10, x9, [x2, #16]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -207,21 +207,21 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes:
 ; ALL:       // %bb.0:
 ; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    ldp x10, x11, [x0, #16]
+; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
-; ALL-NEXT:    add x8, x8, #32
+; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
-; ALL-NEXT:    and x9, x9, #0x1f
-; ALL-NEXT:    sub x8, x8, x9
+; ALL-NEXT:    stp x9, x8, [sp, #48]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    and x9, x10, #0x1f
+; ALL-NEXT:    add x8, x8, #32
 ; ALL-NEXT:    stp q0, q0, [sp]
-; ALL-NEXT:    stp x10, x11, [sp, #48]
 ; ALL-NEXT:    str q1, [sp, #32]
+; ALL-NEXT:    sub x8, x8, x9
 ; ALL-NEXT:    ldp x9, x10, [x8, #16]
 ; ALL-NEXT:    ldr q0, [x8]
-; ALL-NEXT:    stp x9, x10, [x2, #16]
 ; ALL-NEXT:    str q0, [x2]
+; ALL-NEXT:    stp x9, x10, [x2, #16]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -235,21 +235,21 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes:
 ; ALL:       // %bb.0:
 ; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldp x11, x10, [x0, #16]
-; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    ldp x9, x8, [x0, #16]
+; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q0, [x0]
-; ALL-NEXT:    asr x12, x10, #63
-; ALL-NEXT:    and x9, x9, #0x1f
-; ALL-NEXT:    add x8, x8, x9
-; ALL-NEXT:    stp x11, x10, [sp, #16]
+; ALL-NEXT:    and x10, x10, #0x1f
+; ALL-NEXT:    stp x9, x8, [sp, #16]
+; ALL-NEXT:    asr x8, x8, #63
+; ALL-NEXT:    mov x9, sp
 ; ALL-NEXT:    str q0, [sp]
-; ALL-NEXT:    stp x12, x12, [sp, #48]
-; ALL-NEXT:    stp x12, x12, [sp, #32]
+; ALL-NEXT:    stp x8, x8, [sp, #48]
+; ALL-NEXT:    stp x8, x8, [sp, #32]
+; ALL-NEXT:    add x8, x9, x10
 ; ALL-NEXT:    ldp x10, x9, [x8, #16]
 ; ALL-NEXT:    ldr q0, [x8]
-; ALL-NEXT:    stp x10, x9, [x2, #16]
 ; ALL-NEXT:    str q0, [x2]
+; ALL-NEXT:    stp x10, x9, [x2, #16]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1

diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
index 906aa153d73001..a4da6db57ecae3 100644
--- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll
@@ -1,12 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=ALL
-
 define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_4bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    ldr w8, [x1]
-; ALL-NEXT:    ldr w9, [x0]
-; ALL-NEXT:    lsr w8, w9, w8
+; ALL-NEXT:    ldr w8, [x0]
+; ALL-NEXT:    ldr w9, [x1]
+; ALL-NEXT:    lsr w8, w8, w9
 ; ALL-NEXT:    str w8, [x2]
 ; ALL-NEXT:    ret
   %src = load i32, ptr %src.ptr, align 1
@@ -18,9 +17,9 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_4bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    ldr w8, [x1]
-; ALL-NEXT:    ldr w9, [x0]
-; ALL-NEXT:    lsl w8, w9, w8
+; ALL-NEXT:    ldr w8, [x0]
+; ALL-NEXT:    ldr w9, [x1]
+; ALL-NEXT:    lsl w8, w8, w9
 ; ALL-NEXT:    str w8, [x2]
 ; ALL-NEXT:    ret
   %src = load i32, ptr %src.ptr, align 1
@@ -32,9 +31,9 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_4bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    ldr w8, [x1]
-; ALL-NEXT:    ldr w9, [x0]
-; ALL-NEXT:    asr w8, w9, w8
+; ALL-NEXT:    ldr w8, [x0]
+; ALL-NEXT:    ldr w9, [x1]
+; ALL-NEXT:    asr w8, w8, w9
 ; ALL-NEXT:    str w8, [x2]
 ; ALL-NEXT:    ret
   %src = load i32, ptr %src.ptr, align 1
@@ -43,13 +42,12 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
   store i32 %res, ptr %dst, align 1
   ret void
 }
-
 define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_8bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    ldr x8, [x1]
-; ALL-NEXT:    ldr x9, [x0]
-; ALL-NEXT:    lsr x8, x9, x8
+; ALL-NEXT:    ldr x8, [x0]
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    lsr x8, x8, x9
 ; ALL-NEXT:    str x8, [x2]
 ; ALL-NEXT:    ret
   %src = load i64, ptr %src.ptr, align 1
@@ -61,9 +59,9 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_8bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    ldr x8, [x1]
-; ALL-NEXT:    ldr x9, [x0]
-; ALL-NEXT:    lsl x8, x9, x8
+; ALL-NEXT:    ldr x8, [x0]
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    lsl x8, x8, x9
 ; ALL-NEXT:    str x8, [x2]
 ; ALL-NEXT:    ret
   %src = load i64, ptr %src.ptr, align 1
@@ -75,9 +73,9 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_8bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    ldr x8, [x1]
-; ALL-NEXT:    ldr x9, [x0]
-; ALL-NEXT:    asr x8, x9, x8
+; ALL-NEXT:    ldr x8, [x0]
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    asr x8, x8, x9
 ; ALL-NEXT:    str x8, [x2]
 ; ALL-NEXT:    ret
   %src = load i64, ptr %src.ptr, align 1
@@ -86,21 +84,20 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
   store i64 %res, ptr %dst, align 1
   ret void
 }
-
 define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_16bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    ldr x8, [x1]
-; ALL-NEXT:    ldp x10, x9, [x0]
-; ALL-NEXT:    mvn w11, w8
-; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsr x10, x10, x8
-; ALL-NEXT:    lsl x12, x9, #1
-; ALL-NEXT:    lsr x9, x9, x8
-; ALL-NEXT:    lsl x11, x12, x11
-; ALL-NEXT:    orr x8, x11, x10
-; ALL-NEXT:    csel x10, xzr, x9, ne
-; ALL-NEXT:    csel x8, x9, x8, ne
+; ALL-NEXT:    ldp x10, x8, [x0]
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    mvn w12, w9
+; ALL-NEXT:    tst x9, #0x40
+; ALL-NEXT:    lsl x11, x8, #1
+; ALL-NEXT:    lsr x10, x10, x9
+; ALL-NEXT:    lsr x8, x8, x9
+; ALL-NEXT:    lsl x11, x11, x12
+; ALL-NEXT:    orr x9, x11, x10
+; ALL-NEXT:    csel x10, xzr, x8, ne
+; ALL-NEXT:    csel x8, x8, x9, ne
 ; ALL-NEXT:    stp x8, x10, [x2]
 ; ALL-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -112,17 +109,17 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_16bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    ldr x8, [x1]
-; ALL-NEXT:    ldp x9, x10, [x0]
-; ALL-NEXT:    mvn w11, w8
-; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsr x12, x9, #1
-; ALL-NEXT:    lsl x9, x9, x8
-; ALL-NEXT:    lsl x10, x10, x8
-; ALL-NEXT:    lsr x11, x12, x11
-; ALL-NEXT:    orr x8, x10, x11
-; ALL-NEXT:    csel x10, xzr, x9, ne
-; ALL-NEXT:    csel x8, x9, x8, ne
+; ALL-NEXT:    ldp x8, x10, [x0]
+; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    mvn w12, w9
+; ALL-NEXT:    tst x9, #0x40
+; ALL-NEXT:    lsr x11, x8, #1
+; ALL-NEXT:    lsl x10, x10, x9
+; ALL-NEXT:    lsl x8, x8, x9
+; ALL-NEXT:    lsr x11, x11, x12
+; ALL-NEXT:    orr x9, x10, x11
+; ALL-NEXT:    csel x10, xzr, x8, ne
+; ALL-NEXT:    csel x8, x8, x9, ne
 ; ALL-NEXT:    stp x10, x8, [x2]
 ; ALL-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -134,16 +131,16 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_16bytes:
 ; ALL:       // %bb.0:
-; ALL-NEXT:    ldr x8, [x1]
-; ALL-NEXT:    ldp x10, x9, [x0]
-; ALL-NEXT:    mvn w11, w8
-; ALL-NEXT:    tst x8, #0x40
-; ALL-NEXT:    lsr x10, x10, x8
-; ALL-NEXT:    lsl x12, x9, #1
-; ALL-NEXT:    lsl x11, x12, x11
-; ALL-NEXT:    asr x12, x9, x8
-; ALL-NEXT:    asr x8, x9, #63
-; ALL-NEXT:    orr x9, x11, x10
+; ALL-NEXT:    ldp x9, x8, [x0]
+; ALL-NEXT:    ldr x10, [x1]
+; ALL-NEXT:    mvn w12, w10
+; ALL-NEXT:    tst x10, #0x40
+; ALL-NEXT:    lsl x11, x8, #1
+; ALL-NEXT:    lsr x9, x9, x10
+; ALL-NEXT:    lsl x11, x11, x12
+; ALL-NEXT:    asr x12, x8, x10
+; ALL-NEXT:    asr x8, x8, #63
+; ALL-NEXT:    orr x9, x11, x9
 ; ALL-NEXT:    csel x8, x8, x12, ne
 ; ALL-NEXT:    csel x9, x12, x9, ne
 ; ALL-NEXT:    stp x9, x8, [x2]
@@ -154,41 +151,39 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
   store i128 %res, ptr %dst, align 1
   ret void
 }
-
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: lshr_32bytes:
 ; ALL:       // %bb.0:
 ; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    ldp x10, x11, [x0, #16]
+; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
+; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
-; ALL-NEXT:    ubfx x12, x9, #3, #5
-; ALL-NEXT:    add x8, x8, x12
-; ALL-NEXT:    and x9, x9, #0x7
-; ALL-NEXT:    mvn w13, w9
-; ALL-NEXT:    stp q0, q0, [sp, #32]
-; ALL-NEXT:    stp x10, x11, [sp, #16]
+; ALL-NEXT:    stp x9, x8, [sp, #16]
+; ALL-NEXT:    ubfx x8, x10, #3, #5
+; ALL-NEXT:    mov x9, sp
 ; ALL-NEXT:    str q1, [sp]
-; ALL-NEXT:    ldp x11, x10, [x8, #8]
-; ALL-NEXT:    ldr x12, [x8]
-; ALL-NEXT:    ldr x8, [x8, #24]
-; ALL-NEXT:    lsr x15, x11, x9
-; ALL-NEXT:    lsl x11, x11, #1
-; ALL-NEXT:    lsl x14, x10, #1
-; ALL-NEXT:    lsr x10, x10, x9
-; ALL-NEXT:    lsr x12, x12, x9
-; ALL-NEXT:    lsr x9, x8, x9
-; ALL-NEXT:    lsl x8, x8, #1
-; ALL-NEXT:    lsl x11, x11, x13
-; ALL-NEXT:    lsl x8, x8, x13
-; ALL-NEXT:    orr x11, x11, x12
-; ALL-NEXT:    orr x8, x8, x10
-; ALL-NEXT:    lsl x10, x14, x13
-; ALL-NEXT:    orr x10, x15, x10
-; ALL-NEXT:    stp x8, x9, [x2, #16]
-; ALL-NEXT:    stp x11, x10, [x2]
+; ALL-NEXT:    and x10, x10, #0x7
+; ALL-NEXT:    stp q0, q0, [sp, #32]
+; ALL-NEXT:    add x8, x9, x8
+; ALL-NEXT:    mvn w13, w10
+; ALL-NEXT:    ldp x11, x9, [x8, #16]
+; ALL-NEXT:    ldp x8, x12, [x8]
+; ALL-NEXT:    lsl x14, x9, #1
+; ALL-NEXT:    lsl x15, x11, #1
+; ALL-NEXT:    lsr x11, x11, x10
+; ALL-NEXT:    lsl x16, x12, #1
+; ALL-NEXT:    lsr x9, x9, x10
+; ALL-NEXT:    lsr x12, x12, x10
+; ALL-NEXT:    lsl x14, x14, x13
+; ALL-NEXT:    lsr x8, x8, x10
+; ALL-NEXT:    lsl x10, x16, x13
+; ALL-NEXT:    lsl x13, x15, x13
+; ALL-NEXT:    orr x11, x14, x11
+; ALL-NEXT:    stp x11, x9, [x2, #16]
+; ALL-NEXT:    orr x8, x10, x8
+; ALL-NEXT:    orr x9, x12, x13
+; ALL-NEXT:    stp x8, x9, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
@@ -201,35 +196,35 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: shl_32bytes:
 ; ALL:       // %bb.0:
 ; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldr x9, [x1]
-; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    ldp x10, x11, [x0, #16]
+; ALL-NEXT:    ldp x9, x8, [x0, #16]
 ; ALL-NEXT:    movi v0.2d, #0000000000000000
-; ALL-NEXT:    add x8, x8, #32
+; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q1, [x0]
-; ALL-NEXT:    ubfx x12, x9, #3, #5
-; ALL-NEXT:    sub x8, x8, x12
-; ALL-NEXT:    and x9, x9, #0x7
-; ALL-NEXT:    mvn w12, w9
-; ALL-NEXT:    stp q0, q0, [sp]
-; ALL-NEXT:    stp x10, x11, [sp, #48]
-; ALL-NEXT:    str q1, [sp, #32]
-; ALL-NEXT:    ldp x10, x11, [x8]
-; ALL-NEXT:    ldp x13, x8, [x8, #16]
-; ALL-NEXT:    lsr x14, x10, #1
-; ALL-NEXT:    lsl x10, x10, x9
-; ALL-NEXT:    lsl x15, x11, x9
-; ALL-NEXT:    lsr x11, x11, #1
-; ALL-NEXT:    lsr x14, x14, x12
-; ALL-NEXT:    lsr x11, x11, x12
-; ALL-NEXT:    lsl x8, x8, x9
-; ALL-NEXT:    lsl x9, x13, x9
-; ALL-NEXT:    lsr x13, x13, #1
-; ALL-NEXT:    orr x14, x15, x14
-; ALL-NEXT:    lsr x13, x13, x12
-; ALL-NEXT:    orr x9, x9, x11
-; ALL-NEXT:    orr x8, x8, x13
-; ALL-NEXT:    stp x10, x14, [x2]
+; ALL-NEXT:    stp x9, x8, [sp, #48]
+; ALL-NEXT:    mov x8, sp
+; ALL-NEXT:    ubfx x9, x10, #3, #5
+; ALL-NEXT:    add x8, x8, #32
+; ALL-NEXT:    stp q0, q1, [sp, #16]
+; ALL-NEXT:    and x10, x10, #0x7
+; ALL-NEXT:    str q0, [sp]
+; ALL-NEXT:    sub x8, x8, x9
+; ALL-NEXT:    mvn w13, w10
+; ALL-NEXT:    ldp x9, x11, [x8]
+; ALL-NEXT:    ldp x12, x8, [x8, #16]
+; ALL-NEXT:    lsr x14, x9, #1
+; ALL-NEXT:    lsr x15, x11, #1
+; ALL-NEXT:    lsl x11, x11, x10
+; ALL-NEXT:    lsr x16, x12, #1
+; ALL-NEXT:    lsl x9, x9, x10
+; ALL-NEXT:    lsl x12, x12, x10
+; ALL-NEXT:    lsr x14, x14, x13
+; ALL-NEXT:    lsl x8, x8, x10
+; ALL-NEXT:    lsr x10, x16, x13
+; ALL-NEXT:    lsr x13, x15, x13
+; ALL-NEXT:    orr x11, x11, x14
+; ALL-NEXT:    stp x9, x11, [x2]
+; ALL-NEXT:    orr x8, x8, x10
+; ALL-NEXT:    orr x9, x12, x13
 ; ALL-NEXT:    stp x9, x8, [x2, #16]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret
@@ -243,35 +238,35 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; ALL-LABEL: ashr_32bytes:
 ; ALL:       // %bb.0:
 ; ALL-NEXT:    sub sp, sp, #64
-; ALL-NEXT:    ldp x11, x10, [x0, #16]
-; ALL-NEXT:    mov x8, sp
-; ALL-NEXT:    ldr x9, [x1]
+; ALL-NEXT:    ldp x9, x8, [x0, #16]
+; ALL-NEXT:    mov x11, sp
+; ALL-NEXT:    ldr x10, [x1]
 ; ALL-NEXT:    ldr q0, [x0]
-; ALL-NEXT:    asr x12, x10, #63
-; ALL-NEXT:    stp x11, x10, [sp, #16]
-; ALL-NEXT:    ubfx x10, x9, #3, #5
+; ALL-NEXT:    stp x9, x8, [sp, #16]
+; ALL-NEXT:    asr x8, x8, #63
+; ALL-NEXT:    ubfx x9, x10, #3, #5
 ; ALL-NEXT:    str q0, [sp]
-; ALL-NEXT:    add x8, x8, x10
-; ALL-NEXT:    and x9, x9, #0x7
-; ALL-NEXT:    stp x12, x12, [sp, #48]
-; ALL-NEXT:    stp x12, x12, [sp, #32]
-; ALL-NEXT:    mvn w12, w9
-; ALL-NEXT:    ldp x10, x11, [x8, #16]
-; ALL-NEXT:    ldp x8, x13, [x8]
-; ALL-NEXT:    lsl x14, x10, #1
-; ALL-NEXT:    lsr x10, x10, x9
+; ALL-NEXT:    and x10, x10, #0x7
+; ALL-NEXT:    stp x8, x8, [sp, #48]
+; ALL-NEXT:    add x9, x11, x9
+; ALL-NEXT:    mvn w13, w10
+; ALL-NEXT:    stp x8, x8, [sp, #32]
+; ALL-NEXT:    ldp x11, x8, [x9, #16]
+; ALL-NEXT:    ldp x9, x12, [x9]
+; ALL-NEXT:    lsl x14, x8, #1
 ; ALL-NEXT:    lsl x15, x11, #1
-; ALL-NEXT:    asr x11, x11, x9
-; ALL-NEXT:    lsl x15, x15, x12
-; ALL-NEXT:    lsl x14, x14, x12
-; ALL-NEXT:    orr x10, x15, x10
-; ALL-NEXT:    lsl x15, x13, #1
-; ALL-NEXT:    lsl x12, x15, x12
-; ALL-NEXT:    lsr x8, x8, x9
-; ALL-NEXT:    lsr x9, x13, x9
-; ALL-NEXT:    orr x8, x12, x8
-; ALL-NEXT:    orr x9, x9, x14
-; ALL-NEXT:    stp x10, x11, [x2, #16]
+; ALL-NEXT:    lsr x11, x11, x10
+; ALL-NEXT:    lsl x16, x12, #1
+; ALL-NEXT:    asr x8, x8, x10
+; ALL-NEXT:    lsr x12, x12, x10
+; ALL-NEXT:    lsl x14, x14, x13
+; ALL-NEXT:    lsr x9, x9, x10
+; ALL-NEXT:    lsl x10, x16, x13
+; ALL-NEXT:    lsl x13, x15, x13
+; ALL-NEXT:    orr x11, x14, x11
+; ALL-NEXT:    stp x11, x8, [x2, #16]
+; ALL-NEXT:    orr x8, x10, x9
+; ALL-NEXT:    orr x9, x12, x13
 ; ALL-NEXT:    stp x8, x9, [x2]
 ; ALL-NEXT:    add sp, sp, #64
 ; ALL-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/win64_vararg.ll b/llvm/test/CodeGen/AArch64/win64_vararg.ll
index cda1fb9bfeca4e..aaf4cad6087403 100644
--- a/llvm/test/CodeGen/AArch64/win64_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/win64_vararg.ll
@@ -7,8 +7,8 @@ define void @pass_va(i32 %count, ...) nounwind {
 ; CHECK-NEXT:    str x30, [sp, #-80]! // 8-byte Folded Spill
 ; CHECK-NEXT:    add x8, sp, #24
 ; CHECK-NEXT:    add x0, sp, #24
-; CHECK-NEXT:    stp x3, x4, [sp, #40]
 ; CHECK-NEXT:    stp x1, x2, [sp, #24]
+; CHECK-NEXT:    stp x3, x4, [sp, #40]
 ; CHECK-NEXT:    stp x5, x6, [sp, #56]
 ; CHECK-NEXT:    str x7, [sp, #72]
 ; CHECK-NEXT:    str x8, [sp, #8]
@@ -82,8 +82,8 @@ define void @copy1(i64 %a0, ...) nounwind {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    add x8, sp, #24
-; CHECK-NEXT:    stp x3, x4, [sp, #40]
 ; CHECK-NEXT:    stp x1, x2, [sp, #24]
+; CHECK-NEXT:    stp x3, x4, [sp, #40]
 ; CHECK-NEXT:    stp x5, x6, [sp, #56]
 ; CHECK-NEXT:    str x7, [sp, #72]
 ; CHECK-NEXT:    stp x8, x8, [sp], #80
@@ -178,17 +178,17 @@ define void @vla(i32, ptr, ...) local_unnamed_addr {
 ; CHECK-NEXT:    add x29, sp, #40
 ; CHECK-NEXT:    .seh_add_fp 40
 ; CHECK-NEXT:    .seh_endprologue
-; CHECK-NEXT:    add x8, x29, #24
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    add x8, x29, #24
 ; CHECK-NEXT:    mov w9, w0
 ; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x23, sp
-; CHECK-NEXT:    stp x3, x4, [x29, #32]
-; CHECK-NEXT:    stp x8, x2, [x29, #16]
+; CHECK-NEXT:    str x8, [x29, #16]
 ; CHECK-NEXT:    add x8, x9, #15
+; CHECK-NEXT:    mov x23, sp
 ; CHECK-NEXT:    lsr x15, x8, #4
-; CHECK-NEXT:    stp x5, x6, [x29, #48]
-; CHECK-NEXT:    str x7, [x29, #64]
+; CHECK-NEXT:    stp x2, x3, [x29, #24]
+; CHECK-NEXT:    stp x4, x5, [x29, #40]
+; CHECK-NEXT:    stp x6, x7, [x29, #56]
 ; CHECK-NEXT:    bl __chkstk
 ; CHECK-NEXT:    sub x20, sp, x15, lsl #4
 ; CHECK-NEXT:    mov sp, x20

diff --git a/llvm/test/CodeGen/AArch64/win64_vararg2.ll b/llvm/test/CodeGen/AArch64/win64_vararg2.ll
index c155d906e94448..c7a45122fa7731 100644
--- a/llvm/test/CodeGen/AArch64/win64_vararg2.ll
+++ b/llvm/test/CodeGen/AArch64/win64_vararg2.ll
@@ -39,30 +39,26 @@ define i1 @va_func(i32 %a, i8 %b, i8 %c, ...) {
 ; GISEL-LABEL: va_func:
 ; GISEL:       .seh_proc va_func
 ; GISEL-NEXT:  // %bb.0:
-; GISEL-NEXT:    sub sp, sp, #80
-; GISEL-NEXT:    .seh_stackalloc 80
-; GISEL-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
-; GISEL-NEXT:    .seh_save_reg x19, 16
-; GISEL-NEXT:    str x30, [sp, #24] // 8-byte Folded Spill
-; GISEL-NEXT:    .seh_save_reg x30, 24
+; GISEL-NEXT:    str x19, [sp, #-64]! // 8-byte Folded Spill
+; GISEL-NEXT:    .seh_save_reg_x x19, 64
+; GISEL-NEXT:    str x30, [sp, #8] // 8-byte Folded Spill
+; GISEL-NEXT:    .seh_save_reg x30, 8
 ; GISEL-NEXT:    .seh_endprologue
-; GISEL-NEXT:    stp x3, x4, [sp, #40]
+; GISEL-NEXT:    stp x3, x4, [sp, #24]
 ; GISEL-NEXT:    mov w19, w0
-; GISEL-NEXT:    stp x5, x6, [sp, #56]
-; GISEL-NEXT:    str w0, [sp, #12]
-; GISEL-NEXT:    str x7, [sp, #72]
-; GISEL-NEXT:    strb w1, [sp, #11]
-; GISEL-NEXT:    strb w2, [sp, #10]
+; GISEL-NEXT:    stp x5, x6, [sp, #40]
+; GISEL-NEXT:    str x7, [sp, #56]
+; GISEL-NEXT:    str w0, [sp, #4]
+; GISEL-NEXT:    strb w1, [sp, #3]
+; GISEL-NEXT:    strb w2, [sp, #2]
 ; GISEL-NEXT:    bl other
 ; GISEL-NEXT:    cmp w19, w0
 ; GISEL-NEXT:    cset w0, ls
 ; GISEL-NEXT:    .seh_startepilogue
-; GISEL-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
-; GISEL-NEXT:    .seh_save_reg x30, 24
-; GISEL-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
-; GISEL-NEXT:    .seh_save_reg x19, 16
-; GISEL-NEXT:    add sp, sp, #80
-; GISEL-NEXT:    .seh_stackalloc 80
+; GISEL-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
+; GISEL-NEXT:    .seh_save_reg x30, 8
+; GISEL-NEXT:    ldr x19, [sp], #64 // 8-byte Folded Reload
+; GISEL-NEXT:    .seh_save_reg_x x19, 64
 ; GISEL-NEXT:    .seh_endepilogue
 ; GISEL-NEXT:    ret
 ; GISEL-NEXT:    .seh_endfunclet

diff --git a/llvm/test/CodeGen/AArch64/win64_vararg_float.ll b/llvm/test/CodeGen/AArch64/win64_vararg_float.ll
index 6bf37abf8cfa4a..842f48941421d2 100644
--- a/llvm/test/CodeGen/AArch64/win64_vararg_float.ll
+++ b/llvm/test/CodeGen/AArch64/win64_vararg_float.ll
@@ -7,13 +7,13 @@ define void @float_va_fn(float %a, i32 %b, ...) nounwind {
 ; DAGISEL-LABEL: float_va_fn:
 ; DAGISEL:       // %bb.0: // %entry
 ; DAGISEL-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
-; DAGISEL-NEXT:    add x8, sp, #16
 ; DAGISEL-NEXT:    fmov s0, w0
+; DAGISEL-NEXT:    add x8, sp, #16
 ; DAGISEL-NEXT:    add x0, sp, #16
-; DAGISEL-NEXT:    stp x3, x4, [sp, #24]
-; DAGISEL-NEXT:    stp x5, x6, [sp, #40]
-; DAGISEL-NEXT:    stp x8, x2, [sp, #8]
-; DAGISEL-NEXT:    str x7, [sp, #56]
+; DAGISEL-NEXT:    stp x2, x3, [sp, #16]
+; DAGISEL-NEXT:    stp x4, x5, [sp, #32]
+; DAGISEL-NEXT:    stp x6, x7, [sp, #48]
+; DAGISEL-NEXT:    str x8, [sp, #8]
 ; DAGISEL-NEXT:    bl f_va_list
 ; DAGISEL-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
 ; DAGISEL-NEXT:    ret
@@ -57,13 +57,13 @@ define void @double_va_fn(double %a, i32 %b, ...) nounwind {
 ; DAGISEL-LABEL: double_va_fn:
 ; DAGISEL:       // %bb.0: // %entry
 ; DAGISEL-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
-; DAGISEL-NEXT:    add x8, sp, #16
 ; DAGISEL-NEXT:    fmov d0, x0
+; DAGISEL-NEXT:    add x8, sp, #16
 ; DAGISEL-NEXT:    add x0, sp, #16
-; DAGISEL-NEXT:    stp x3, x4, [sp, #24]
-; DAGISEL-NEXT:    stp x5, x6, [sp, #40]
-; DAGISEL-NEXT:    stp x8, x2, [sp, #8]
-; DAGISEL-NEXT:    str x7, [sp, #56]
+; DAGISEL-NEXT:    stp x2, x3, [sp, #16]
+; DAGISEL-NEXT:    stp x4, x5, [sp, #32]
+; DAGISEL-NEXT:    stp x6, x7, [sp, #48]
+; DAGISEL-NEXT:    str x8, [sp, #8]
 ; DAGISEL-NEXT:    bl d_va_list
 ; DAGISEL-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
 ; DAGISEL-NEXT:    ret
@@ -102,28 +102,28 @@ declare void @d_va_list(double, ptr)
 define void @call_f_va() nounwind {
 ; DAGISEL-LABEL: call_f_va:
 ; DAGISEL:       // %bb.0: // %entry
-; DAGISEL-NEXT:    mov w0, #1065353216
-; DAGISEL-NEXT:    mov w1, #2
-; DAGISEL-NEXT:    mov x2, #4613937818241073152
-; DAGISEL-NEXT:    mov w3, #4
+; DAGISEL-NEXT:    mov w0, #1065353216 // =0x3f800000
+; DAGISEL-NEXT:    mov w1, #2 // =0x2
+; DAGISEL-NEXT:    mov x2, #4613937818241073152 // =0x4008000000000000
+; DAGISEL-NEXT:    mov w3, #4 // =0x4
 ; DAGISEL-NEXT:    b other_f_va_fn
 ;
 ; FASTISEL-LABEL: call_f_va:
 ; FASTISEL:       // %bb.0: // %entry
-; FASTISEL-NEXT:    mov w0, #1065353216
-; FASTISEL-NEXT:    mov w1, #2
-; FASTISEL-NEXT:    mov x2, #4613937818241073152
-; FASTISEL-NEXT:    mov w3, #4
+; FASTISEL-NEXT:    mov w0, #1065353216 // =0x3f800000
+; FASTISEL-NEXT:    mov w1, #2 // =0x2
+; FASTISEL-NEXT:    mov x2, #4613937818241073152 // =0x4008000000000000
+; FASTISEL-NEXT:    mov w3, #4 // =0x4
 ; FASTISEL-NEXT:    b other_f_va_fn
 ;
 ; GISEL-LABEL: call_f_va:
 ; GISEL:       // %bb.0: // %entry
 ; GISEL-NEXT:    fmov s0, #1.00000000
 ; GISEL-NEXT:    fmov w0, s0
-; GISEL-NEXT:    mov w1, #2
+; GISEL-NEXT:    mov w1, #2 // =0x2
 ; GISEL-NEXT:    fmov d0, #3.00000000
 ; GISEL-NEXT:    fmov x2, d0
-; GISEL-NEXT:    mov w3, #4
+; GISEL-NEXT:    mov w3, #4 // =0x4
 ; GISEL-NEXT:    b other_f_va_fn
 entry:
   tail call void (float, i32, ...) @other_f_va_fn(float 1.000000e+00, i32 2, double 3.000000e+00, i32 4)
@@ -135,28 +135,28 @@ declare void @other_f_va_fn(float, i32, ...)
 define void @call_d_va() nounwind {
 ; DAGISEL-LABEL: call_d_va:
 ; DAGISEL:       // %bb.0: // %entry
-; DAGISEL-NEXT:    mov x0, #4607182418800017408
-; DAGISEL-NEXT:    mov w1, #2
-; DAGISEL-NEXT:    mov x2, #4613937818241073152
-; DAGISEL-NEXT:    mov w3, #4
+; DAGISEL-NEXT:    mov x0, #4607182418800017408 // =0x3ff0000000000000
+; DAGISEL-NEXT:    mov w1, #2 // =0x2
+; DAGISEL-NEXT:    mov x2, #4613937818241073152 // =0x4008000000000000
+; DAGISEL-NEXT:    mov w3, #4 // =0x4
 ; DAGISEL-NEXT:    b other_d_va_fn
 ;
 ; FASTISEL-LABEL: call_d_va:
 ; FASTISEL:       // %bb.0: // %entry
-; FASTISEL-NEXT:    mov x0, #4607182418800017408
-; FASTISEL-NEXT:    mov w1, #2
-; FASTISEL-NEXT:    mov x2, #4613937818241073152
-; FASTISEL-NEXT:    mov w3, #4
+; FASTISEL-NEXT:    mov x0, #4607182418800017408 // =0x3ff0000000000000
+; FASTISEL-NEXT:    mov w1, #2 // =0x2
+; FASTISEL-NEXT:    mov x2, #4613937818241073152 // =0x4008000000000000
+; FASTISEL-NEXT:    mov w3, #4 // =0x4
 ; FASTISEL-NEXT:    b other_d_va_fn
 ;
 ; GISEL-LABEL: call_d_va:
 ; GISEL:       // %bb.0: // %entry
 ; GISEL-NEXT:    fmov d0, #1.00000000
 ; GISEL-NEXT:    fmov x0, d0
-; GISEL-NEXT:    mov w1, #2
+; GISEL-NEXT:    mov w1, #2 // =0x2
 ; GISEL-NEXT:    fmov d0, #3.00000000
 ; GISEL-NEXT:    fmov x2, d0
-; GISEL-NEXT:    mov w3, #4
+; GISEL-NEXT:    mov w3, #4 // =0x4
 ; GISEL-NEXT:    b other_d_va_fn
 entry:
   tail call void (double, i32, ...) @other_d_va_fn(double 1.000000e+00, i32 2, double 3.000000e+00, i32 4)
@@ -170,16 +170,16 @@ define void @call_d_non_va() nounwind {
 ; DAGISEL:       // %bb.0: // %entry
 ; DAGISEL-NEXT:    fmov d0, #1.00000000
 ; DAGISEL-NEXT:    fmov d1, #3.00000000
-; DAGISEL-NEXT:    mov w0, #2
-; DAGISEL-NEXT:    mov w1, #4
+; DAGISEL-NEXT:    mov w0, #2 // =0x2
+; DAGISEL-NEXT:    mov w1, #4 // =0x4
 ; DAGISEL-NEXT:    b other_d_non_va_fn
 ;
 ; O0-LABEL: call_d_non_va:
 ; O0:       // %bb.0: // %entry
 ; O0-NEXT:    fmov d0, #1.00000000
-; O0-NEXT:    mov w0, #2
+; O0-NEXT:    mov w0, #2 // =0x2
 ; O0-NEXT:    fmov d1, #3.00000000
-; O0-NEXT:    mov w1, #4
+; O0-NEXT:    mov w1, #4 // =0x4
 ; O0-NEXT:    b other_d_non_va_fn
 entry:
   tail call void (double, i32, double, i32) @other_d_non_va_fn(double 1.000000e+00, i32 2, double 3.000000e+00, i32 4)

diff --git a/llvm/test/CodeGen/AArch64/win64_vararg_float_cc.ll b/llvm/test/CodeGen/AArch64/win64_vararg_float_cc.ll
index 26d7af848a152f..90f878327abc8e 100644
--- a/llvm/test/CodeGen/AArch64/win64_vararg_float_cc.ll
+++ b/llvm/test/CodeGen/AArch64/win64_vararg_float_cc.ll
@@ -5,9 +5,13 @@
 
 ; Check that non-vararg functions compilation is not broken
 define win64cc float @foo(float %arg) nounwind {
-; GISEL-LABEL: foo:
-; GISEL-NEXT:  // %bb.0: // %entry
-; GISEL-NEXT:  ret
+; DAGISEL-LABEL: foo:
+; DAGISEL:       // %bb.0: // %entry
+; DAGISEL-NEXT:    ret
+;
+; O0-LABEL: foo:
+; O0:       // %bb.0: // %entry
+; O0-NEXT:    ret
 entry:
   ret float %arg
 }
@@ -16,13 +20,13 @@ define win64cc void @float_va_fn(float %a, i32 %b, ...) nounwind {
 ; DAGISEL-LABEL: float_va_fn:
 ; DAGISEL:       // %bb.0: // %entry
 ; DAGISEL-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
-; DAGISEL-NEXT:    add x8, sp, #16
 ; DAGISEL-NEXT:    fmov s0, w0
+; DAGISEL-NEXT:    add x8, sp, #16
 ; DAGISEL-NEXT:    add x0, sp, #16
-; DAGISEL-NEXT:    stp x3, x4, [sp, #24]
-; DAGISEL-NEXT:    stp x5, x6, [sp, #40]
-; DAGISEL-NEXT:    stp x8, x2, [sp, #8]
-; DAGISEL-NEXT:    str x7, [sp, #56]
+; DAGISEL-NEXT:    stp x2, x3, [sp, #16]
+; DAGISEL-NEXT:    stp x4, x5, [sp, #32]
+; DAGISEL-NEXT:    stp x6, x7, [sp, #48]
+; DAGISEL-NEXT:    str x8, [sp, #8]
 ; DAGISEL-NEXT:    bl f_va_list
 ; DAGISEL-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
 ; DAGISEL-NEXT:    ret
@@ -66,13 +70,13 @@ define win64cc void @double_va_fn(double %a, i32 %b, ...) nounwind {
 ; DAGISEL-LABEL: double_va_fn:
 ; DAGISEL:       // %bb.0: // %entry
 ; DAGISEL-NEXT:    str x30, [sp, #-64]! // 8-byte Folded Spill
-; DAGISEL-NEXT:    add x8, sp, #16
 ; DAGISEL-NEXT:    fmov d0, x0
+; DAGISEL-NEXT:    add x8, sp, #16
 ; DAGISEL-NEXT:    add x0, sp, #16
-; DAGISEL-NEXT:    stp x3, x4, [sp, #24]
-; DAGISEL-NEXT:    stp x5, x6, [sp, #40]
-; DAGISEL-NEXT:    stp x8, x2, [sp, #8]
-; DAGISEL-NEXT:    str x7, [sp, #56]
+; DAGISEL-NEXT:    stp x2, x3, [sp, #16]
+; DAGISEL-NEXT:    stp x4, x5, [sp, #32]
+; DAGISEL-NEXT:    stp x6, x7, [sp, #48]
+; DAGISEL-NEXT:    str x8, [sp, #8]
 ; DAGISEL-NEXT:    bl d_va_list
 ; DAGISEL-NEXT:    ldr x30, [sp], #64 // 8-byte Folded Reload
 ; DAGISEL-NEXT:    ret
@@ -112,10 +116,10 @@ define void @call_f_va() nounwind {
 ; DAGISEL-LABEL: call_f_va:
 ; DAGISEL:       // %bb.0: // %entry
 ; DAGISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; DAGISEL-NEXT:    mov w0, #1065353216
-; DAGISEL-NEXT:    mov w1, #2
-; DAGISEL-NEXT:    mov x2, #4613937818241073152
-; DAGISEL-NEXT:    mov w3, #4
+; DAGISEL-NEXT:    mov w0, #1065353216 // =0x3f800000
+; DAGISEL-NEXT:    mov w1, #2 // =0x2
+; DAGISEL-NEXT:    mov x2, #4613937818241073152 // =0x4008000000000000
+; DAGISEL-NEXT:    mov w3, #4 // =0x4
 ; DAGISEL-NEXT:    bl other_f_va_fn
 ; DAGISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; DAGISEL-NEXT:    ret
@@ -123,10 +127,10 @@ define void @call_f_va() nounwind {
 ; FASTISEL-LABEL: call_f_va:
 ; FASTISEL:       // %bb.0: // %entry
 ; FASTISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; FASTISEL-NEXT:    mov w0, #1065353216
-; FASTISEL-NEXT:    mov w1, #2
-; FASTISEL-NEXT:    mov x2, #4613937818241073152
-; FASTISEL-NEXT:    mov w3, #4
+; FASTISEL-NEXT:    mov w0, #1065353216 // =0x3f800000
+; FASTISEL-NEXT:    mov w1, #2 // =0x2
+; FASTISEL-NEXT:    mov x2, #4613937818241073152 // =0x4008000000000000
+; FASTISEL-NEXT:    mov w3, #4 // =0x4
 ; FASTISEL-NEXT:    bl other_f_va_fn
 ; FASTISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; FASTISEL-NEXT:    ret
@@ -136,10 +140,10 @@ define void @call_f_va() nounwind {
 ; GISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISEL-NEXT:    fmov s0, #1.00000000
 ; GISEL-NEXT:    fmov w0, s0
-; GISEL-NEXT:    mov w1, #2
+; GISEL-NEXT:    mov w1, #2 // =0x2
 ; GISEL-NEXT:    fmov d0, #3.00000000
 ; GISEL-NEXT:    fmov x2, d0
-; GISEL-NEXT:    mov w3, #4
+; GISEL-NEXT:    mov w3, #4 // =0x4
 ; GISEL-NEXT:    bl other_f_va_fn
 ; GISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISEL-NEXT:    ret
@@ -154,10 +158,10 @@ define void @call_d_va() nounwind {
 ; DAGISEL-LABEL: call_d_va:
 ; DAGISEL:       // %bb.0: // %entry
 ; DAGISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; DAGISEL-NEXT:    mov x0, #4607182418800017408
-; DAGISEL-NEXT:    mov w1, #2
-; DAGISEL-NEXT:    mov x2, #4613937818241073152
-; DAGISEL-NEXT:    mov w3, #4
+; DAGISEL-NEXT:    mov x0, #4607182418800017408 // =0x3ff0000000000000
+; DAGISEL-NEXT:    mov w1, #2 // =0x2
+; DAGISEL-NEXT:    mov x2, #4613937818241073152 // =0x4008000000000000
+; DAGISEL-NEXT:    mov w3, #4 // =0x4
 ; DAGISEL-NEXT:    bl other_d_va_fn
 ; DAGISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; DAGISEL-NEXT:    ret
@@ -165,10 +169,10 @@ define void @call_d_va() nounwind {
 ; FASTISEL-LABEL: call_d_va:
 ; FASTISEL:       // %bb.0: // %entry
 ; FASTISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; FASTISEL-NEXT:    mov x0, #4607182418800017408
-; FASTISEL-NEXT:    mov w1, #2
-; FASTISEL-NEXT:    mov x2, #4613937818241073152
-; FASTISEL-NEXT:    mov w3, #4
+; FASTISEL-NEXT:    mov x0, #4607182418800017408 // =0x3ff0000000000000
+; FASTISEL-NEXT:    mov w1, #2 // =0x2
+; FASTISEL-NEXT:    mov x2, #4613937818241073152 // =0x4008000000000000
+; FASTISEL-NEXT:    mov w3, #4 // =0x4
 ; FASTISEL-NEXT:    bl other_d_va_fn
 ; FASTISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; FASTISEL-NEXT:    ret
@@ -178,10 +182,10 @@ define void @call_d_va() nounwind {
 ; GISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; GISEL-NEXT:    fmov d0, #1.00000000
 ; GISEL-NEXT:    fmov x0, d0
-; GISEL-NEXT:    mov w1, #2
+; GISEL-NEXT:    mov w1, #2 // =0x2
 ; GISEL-NEXT:    fmov d0, #3.00000000
 ; GISEL-NEXT:    fmov x2, d0
-; GISEL-NEXT:    mov w3, #4
+; GISEL-NEXT:    mov w3, #4 // =0x4
 ; GISEL-NEXT:    bl other_d_va_fn
 ; GISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; GISEL-NEXT:    ret
@@ -198,8 +202,8 @@ define void @call_d_non_va() nounwind {
 ; DAGISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; DAGISEL-NEXT:    fmov d0, #1.00000000
 ; DAGISEL-NEXT:    fmov d1, #3.00000000
-; DAGISEL-NEXT:    mov w0, #2
-; DAGISEL-NEXT:    mov w1, #4
+; DAGISEL-NEXT:    mov w0, #2 // =0x2
+; DAGISEL-NEXT:    mov w1, #4 // =0x4
 ; DAGISEL-NEXT:    bl other_d_non_va_fn
 ; DAGISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; DAGISEL-NEXT:    ret
@@ -208,9 +212,9 @@ define void @call_d_non_va() nounwind {
 ; O0:       // %bb.0: // %entry
 ; O0-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; O0-NEXT:    fmov d0, #1.00000000
-; O0-NEXT:    mov w0, #2
+; O0-NEXT:    mov w0, #2 // =0x2
 ; O0-NEXT:    fmov d1, #3.00000000
-; O0-NEXT:    mov w1, #4
+; O0-NEXT:    mov w1, #4 // =0x4
 ; O0-NEXT:    bl other_d_non_va_fn
 ; O0-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; O0-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/wineh-bti.ll b/llvm/test/CodeGen/AArch64/wineh-bti.ll
index 125b0cb7923943..aa6a685fc365bc 100644
--- a/llvm/test/CodeGen/AArch64/wineh-bti.ll
+++ b/llvm/test/CodeGen/AArch64/wineh-bti.ll
@@ -43,7 +43,7 @@ lbl4:
 
 ; CHECK:      .LBB0_2:
 ; CHECK-NEXT: hint #36
-; CHECK-NEXT: mov w0, #1
+; CHECK: mov w0, #1
 
 ; CHECK:      .LBB0_3:
 ; CHECK-NEXT: hint #36

diff --git a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
index 0cbd1557829f26..75a7c7f4a0511e 100644
--- a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
+++ b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
@@ -22,8 +22,8 @@ define dso_local i32 @used_gpr_arg(i32 noundef %a, i32 noundef %b, i32 noundef %
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mul w8, w1, w0
 ; CHECK-NEXT:    orr w0, w8, w2
-; CHECK-NEXT:    mov x1, #0
-; CHECK-NEXT:    mov x2, #0
+; CHECK-NEXT:    mov x1, #0 // =0x0
+; CHECK-NEXT:    mov x2, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -37,9 +37,9 @@ define dso_local i32 @used_gpr(i32 noundef %a, i32 noundef %b, i32 noundef %c) l
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mul w8, w1, w0
 ; CHECK-NEXT:    orr w0, w8, w2
-; CHECK-NEXT:    mov x1, #0
-; CHECK-NEXT:    mov x2, #0
-; CHECK-NEXT:    mov x8, #0
+; CHECK-NEXT:    mov x1, #0 // =0x0
+; CHECK-NEXT:    mov x2, #0 // =0x0
+; CHECK-NEXT:    mov x8, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -53,8 +53,8 @@ define dso_local i32 @used_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) l
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mul w8, w1, w0
 ; CHECK-NEXT:    orr w0, w8, w2
-; CHECK-NEXT:    mov x1, #0
-; CHECK-NEXT:    mov x2, #0
+; CHECK-NEXT:    mov x1, #0 // =0x0
+; CHECK-NEXT:    mov x2, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -68,9 +68,9 @@ define dso_local i32 @used(i32 noundef %a, i32 noundef %b, i32 noundef %c) local
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mul w8, w1, w0
 ; CHECK-NEXT:    orr w0, w8, w2
-; CHECK-NEXT:    mov x1, #0
-; CHECK-NEXT:    mov x2, #0
-; CHECK-NEXT:    mov x8, #0
+; CHECK-NEXT:    mov x1, #0 // =0x0
+; CHECK-NEXT:    mov x2, #0 // =0x0
+; CHECK-NEXT:    mov x8, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -83,16 +83,16 @@ define dso_local i32 @all_gpr_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c
 ; CHECK-LABEL: all_gpr_arg:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mul w8, w1, w0
-; CHECK-NEXT:    mov x1, #0
-; CHECK-NEXT:    mov x3, #0
-; CHECK-NEXT:    mov x4, #0
+; CHECK-NEXT:    mov x1, #0 // =0x0
+; CHECK-NEXT:    mov x3, #0 // =0x0
+; CHECK-NEXT:    mov x4, #0 // =0x0
+; CHECK-NEXT:    mov x5, #0 // =0x0
+; CHECK-NEXT:    mov x6, #0 // =0x0
+; CHECK-NEXT:    mov x7, #0 // =0x0
+; CHECK-NEXT:    mov x18, #0 // =0x0
 ; CHECK-NEXT:    orr w0, w8, w2
-; CHECK-NEXT:    mov x2, #0
-; CHECK-NEXT:    mov x5, #0
-; CHECK-NEXT:    mov x6, #0
-; CHECK-NEXT:    mov x7, #0
-; CHECK-NEXT:    mov x8, #0
-; CHECK-NEXT:    mov x18, #0
+; CHECK-NEXT:    mov x2, #0 // =0x0
+; CHECK-NEXT:    mov x8, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -105,25 +105,25 @@ define dso_local i32 @all_gpr(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo
 ; CHECK-LABEL: all_gpr:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mul w8, w1, w0
-; CHECK-NEXT:    mov x1, #0
-; CHECK-NEXT:    mov x3, #0
-; CHECK-NEXT:    mov x4, #0
+; CHECK-NEXT:    mov x1, #0 // =0x0
+; CHECK-NEXT:    mov x3, #0 // =0x0
+; CHECK-NEXT:    mov x4, #0 // =0x0
+; CHECK-NEXT:    mov x5, #0 // =0x0
+; CHECK-NEXT:    mov x6, #0 // =0x0
+; CHECK-NEXT:    mov x7, #0 // =0x0
+; CHECK-NEXT:    mov x9, #0 // =0x0
+; CHECK-NEXT:    mov x10, #0 // =0x0
 ; CHECK-NEXT:    orr w0, w8, w2
-; CHECK-NEXT:    mov x2, #0
-; CHECK-NEXT:    mov x5, #0
-; CHECK-NEXT:    mov x6, #0
-; CHECK-NEXT:    mov x7, #0
-; CHECK-NEXT:    mov x8, #0
-; CHECK-NEXT:    mov x9, #0
-; CHECK-NEXT:    mov x10, #0
-; CHECK-NEXT:    mov x11, #0
-; CHECK-NEXT:    mov x12, #0
-; CHECK-NEXT:    mov x13, #0
-; CHECK-NEXT:    mov x14, #0
-; CHECK-NEXT:    mov x15, #0
-; CHECK-NEXT:    mov x16, #0
-; CHECK-NEXT:    mov x17, #0
-; CHECK-NEXT:    mov x18, #0
+; CHECK-NEXT:    mov x2, #0 // =0x0
+; CHECK-NEXT:    mov x8, #0 // =0x0
+; CHECK-NEXT:    mov x11, #0 // =0x0
+; CHECK-NEXT:    mov x12, #0 // =0x0
+; CHECK-NEXT:    mov x13, #0 // =0x0
+; CHECK-NEXT:    mov x14, #0 // =0x0
+; CHECK-NEXT:    mov x15, #0 // =0x0
+; CHECK-NEXT:    mov x16, #0 // =0x0
+; CHECK-NEXT:    mov x17, #0 // =0x0
+; CHECK-NEXT:    mov x18, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -136,17 +136,17 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo
 ; DEFAULT-LABEL: all_arg:
 ; DEFAULT:       // %bb.0: // %entry
 ; DEFAULT-NEXT:    mul w8, w1, w0
-; DEFAULT-NEXT:    mov x1, #0
-; DEFAULT-NEXT:    mov x3, #0
-; DEFAULT-NEXT:    mov x4, #0
-; DEFAULT-NEXT:    orr w0, w8, w2
-; DEFAULT-NEXT:    mov x2, #0
-; DEFAULT-NEXT:    mov x5, #0
-; DEFAULT-NEXT:    mov x6, #0
-; DEFAULT-NEXT:    mov x7, #0
-; DEFAULT-NEXT:    mov x8, #0
-; DEFAULT-NEXT:    mov x18, #0
+; DEFAULT-NEXT:    mov x1, #0 // =0x0
+; DEFAULT-NEXT:    mov x3, #0 // =0x0
+; DEFAULT-NEXT:    mov x4, #0 // =0x0
+; DEFAULT-NEXT:    mov x5, #0 // =0x0
+; DEFAULT-NEXT:    mov x6, #0 // =0x0
+; DEFAULT-NEXT:    mov x7, #0 // =0x0
+; DEFAULT-NEXT:    mov x18, #0 // =0x0
 ; DEFAULT-NEXT:    movi v0.2d, #0000000000000000
+; DEFAULT-NEXT:    orr w0, w8, w2
+; DEFAULT-NEXT:    mov x2, #0 // =0x0
+; DEFAULT-NEXT:    mov x8, #0 // =0x0
 ; DEFAULT-NEXT:    movi v1.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v2.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v3.2d, #0000000000000000
@@ -159,17 +159,17 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo
 ; SVE-LABEL: all_arg:
 ; SVE:       // %bb.0: // %entry
 ; SVE-NEXT:    mul w8, w1, w0
-; SVE-NEXT:    mov x1, #0
-; SVE-NEXT:    mov x3, #0
-; SVE-NEXT:    mov x4, #0
-; SVE-NEXT:    orr w0, w8, w2
-; SVE-NEXT:    mov x2, #0
-; SVE-NEXT:    mov x5, #0
-; SVE-NEXT:    mov x6, #0
-; SVE-NEXT:    mov x7, #0
-; SVE-NEXT:    mov x8, #0
-; SVE-NEXT:    mov x18, #0
+; SVE-NEXT:    mov x1, #0 // =0x0
+; SVE-NEXT:    mov x3, #0 // =0x0
+; SVE-NEXT:    mov x4, #0 // =0x0
+; SVE-NEXT:    mov x5, #0 // =0x0
+; SVE-NEXT:    mov x6, #0 // =0x0
+; SVE-NEXT:    mov x7, #0 // =0x0
+; SVE-NEXT:    mov x18, #0 // =0x0
 ; SVE-NEXT:    mov z0.d, #0 // =0x0
+; SVE-NEXT:    orr w0, w8, w2
+; SVE-NEXT:    mov x2, #0 // =0x0
+; SVE-NEXT:    mov x8, #0 // =0x0
 ; SVE-NEXT:    mov z1.d, #0 // =0x0
 ; SVE-NEXT:    mov z2.d, #0 // =0x0
 ; SVE-NEXT:    mov z3.d, #0 // =0x0
@@ -193,25 +193,25 @@ define dso_local i32 @all(i32 noundef %a, i32 noundef %b, i32 noundef %c) local_
 ; DEFAULT-LABEL: all:
 ; DEFAULT:       // %bb.0: // %entry
 ; DEFAULT-NEXT:    mul w8, w1, w0
-; DEFAULT-NEXT:    mov x1, #0
-; DEFAULT-NEXT:    mov x3, #0
-; DEFAULT-NEXT:    mov x4, #0
+; DEFAULT-NEXT:    mov x1, #0 // =0x0
+; DEFAULT-NEXT:    mov x3, #0 // =0x0
+; DEFAULT-NEXT:    mov x4, #0 // =0x0
+; DEFAULT-NEXT:    mov x5, #0 // =0x0
+; DEFAULT-NEXT:    mov x6, #0 // =0x0
+; DEFAULT-NEXT:    mov x7, #0 // =0x0
+; DEFAULT-NEXT:    mov x9, #0 // =0x0
+; DEFAULT-NEXT:    mov x10, #0 // =0x0
 ; DEFAULT-NEXT:    orr w0, w8, w2
-; DEFAULT-NEXT:    mov x2, #0
-; DEFAULT-NEXT:    mov x5, #0
-; DEFAULT-NEXT:    mov x6, #0
-; DEFAULT-NEXT:    mov x7, #0
-; DEFAULT-NEXT:    mov x8, #0
-; DEFAULT-NEXT:    mov x9, #0
-; DEFAULT-NEXT:    mov x10, #0
-; DEFAULT-NEXT:    mov x11, #0
-; DEFAULT-NEXT:    mov x12, #0
-; DEFAULT-NEXT:    mov x13, #0
-; DEFAULT-NEXT:    mov x14, #0
-; DEFAULT-NEXT:    mov x15, #0
-; DEFAULT-NEXT:    mov x16, #0
-; DEFAULT-NEXT:    mov x17, #0
-; DEFAULT-NEXT:    mov x18, #0
+; DEFAULT-NEXT:    mov x2, #0 // =0x0
+; DEFAULT-NEXT:    mov x8, #0 // =0x0
+; DEFAULT-NEXT:    mov x11, #0 // =0x0
+; DEFAULT-NEXT:    mov x12, #0 // =0x0
+; DEFAULT-NEXT:    mov x13, #0 // =0x0
+; DEFAULT-NEXT:    mov x14, #0 // =0x0
+; DEFAULT-NEXT:    mov x15, #0 // =0x0
+; DEFAULT-NEXT:    mov x16, #0 // =0x0
+; DEFAULT-NEXT:    mov x17, #0 // =0x0
+; DEFAULT-NEXT:    mov x18, #0 // =0x0
 ; DEFAULT-NEXT:    movi v0.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v1.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v2.2d, #0000000000000000
@@ -241,25 +241,25 @@ define dso_local i32 @all(i32 noundef %a, i32 noundef %b, i32 noundef %c) local_
 ; SVE-LABEL: all:
 ; SVE:       // %bb.0: // %entry
 ; SVE-NEXT:    mul w8, w1, w0
-; SVE-NEXT:    mov x1, #0
-; SVE-NEXT:    mov x3, #0
-; SVE-NEXT:    mov x4, #0
+; SVE-NEXT:    mov x1, #0 // =0x0
+; SVE-NEXT:    mov x3, #0 // =0x0
+; SVE-NEXT:    mov x4, #0 // =0x0
+; SVE-NEXT:    mov x5, #0 // =0x0
+; SVE-NEXT:    mov x6, #0 // =0x0
+; SVE-NEXT:    mov x7, #0 // =0x0
+; SVE-NEXT:    mov x9, #0 // =0x0
+; SVE-NEXT:    mov x10, #0 // =0x0
 ; SVE-NEXT:    orr w0, w8, w2
-; SVE-NEXT:    mov x2, #0
-; SVE-NEXT:    mov x5, #0
-; SVE-NEXT:    mov x6, #0
-; SVE-NEXT:    mov x7, #0
-; SVE-NEXT:    mov x8, #0
-; SVE-NEXT:    mov x9, #0
-; SVE-NEXT:    mov x10, #0
-; SVE-NEXT:    mov x11, #0
-; SVE-NEXT:    mov x12, #0
-; SVE-NEXT:    mov x13, #0
-; SVE-NEXT:    mov x14, #0
-; SVE-NEXT:    mov x15, #0
-; SVE-NEXT:    mov x16, #0
-; SVE-NEXT:    mov x17, #0
-; SVE-NEXT:    mov x18, #0
+; SVE-NEXT:    mov x2, #0 // =0x0
+; SVE-NEXT:    mov x8, #0 // =0x0
+; SVE-NEXT:    mov x11, #0 // =0x0
+; SVE-NEXT:    mov x12, #0 // =0x0
+; SVE-NEXT:    mov x13, #0 // =0x0
+; SVE-NEXT:    mov x14, #0 // =0x0
+; SVE-NEXT:    mov x15, #0 // =0x0
+; SVE-NEXT:    mov x16, #0 // =0x0
+; SVE-NEXT:    mov x17, #0 // =0x0
+; SVE-NEXT:    mov x18, #0 // =0x0
 ; SVE-NEXT:    mov z0.d, #0 // =0x0
 ; SVE-NEXT:    mov z1.d, #0 // =0x0
 ; SVE-NEXT:    mov z2.d, #0 // =0x0
@@ -394,16 +394,16 @@ define dso_local double @all_gpr_arg_float(double noundef %a, float noundef %b)
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvt d1, s1
 ; CHECK-NEXT:    fmul d0, d1, d0
-; CHECK-NEXT:    mov x0, #0
-; CHECK-NEXT:    mov x1, #0
-; CHECK-NEXT:    mov x2, #0
-; CHECK-NEXT:    mov x3, #0
-; CHECK-NEXT:    mov x4, #0
-; CHECK-NEXT:    mov x5, #0
-; CHECK-NEXT:    mov x6, #0
-; CHECK-NEXT:    mov x7, #0
-; CHECK-NEXT:    mov x8, #0
-; CHECK-NEXT:    mov x18, #0
+; CHECK-NEXT:    mov x0, #0 // =0x0
+; CHECK-NEXT:    mov x1, #0 // =0x0
+; CHECK-NEXT:    mov x2, #0 // =0x0
+; CHECK-NEXT:    mov x3, #0 // =0x0
+; CHECK-NEXT:    mov x4, #0 // =0x0
+; CHECK-NEXT:    mov x5, #0 // =0x0
+; CHECK-NEXT:    mov x6, #0 // =0x0
+; CHECK-NEXT:    mov x7, #0 // =0x0
+; CHECK-NEXT:    mov x8, #0 // =0x0
+; CHECK-NEXT:    mov x18, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -417,25 +417,25 @@ define dso_local double @all_gpr_float(double noundef %a, float noundef %b) loca
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fcvt d1, s1
 ; CHECK-NEXT:    fmul d0, d1, d0
-; CHECK-NEXT:    mov x0, #0
-; CHECK-NEXT:    mov x1, #0
-; CHECK-NEXT:    mov x2, #0
-; CHECK-NEXT:    mov x3, #0
-; CHECK-NEXT:    mov x4, #0
-; CHECK-NEXT:    mov x5, #0
-; CHECK-NEXT:    mov x6, #0
-; CHECK-NEXT:    mov x7, #0
-; CHECK-NEXT:    mov x8, #0
-; CHECK-NEXT:    mov x9, #0
-; CHECK-NEXT:    mov x10, #0
-; CHECK-NEXT:    mov x11, #0
-; CHECK-NEXT:    mov x12, #0
-; CHECK-NEXT:    mov x13, #0
-; CHECK-NEXT:    mov x14, #0
-; CHECK-NEXT:    mov x15, #0
-; CHECK-NEXT:    mov x16, #0
-; CHECK-NEXT:    mov x17, #0
-; CHECK-NEXT:    mov x18, #0
+; CHECK-NEXT:    mov x0, #0 // =0x0
+; CHECK-NEXT:    mov x1, #0 // =0x0
+; CHECK-NEXT:    mov x2, #0 // =0x0
+; CHECK-NEXT:    mov x3, #0 // =0x0
+; CHECK-NEXT:    mov x4, #0 // =0x0
+; CHECK-NEXT:    mov x5, #0 // =0x0
+; CHECK-NEXT:    mov x6, #0 // =0x0
+; CHECK-NEXT:    mov x7, #0 // =0x0
+; CHECK-NEXT:    mov x8, #0 // =0x0
+; CHECK-NEXT:    mov x9, #0 // =0x0
+; CHECK-NEXT:    mov x10, #0 // =0x0
+; CHECK-NEXT:    mov x11, #0 // =0x0
+; CHECK-NEXT:    mov x12, #0 // =0x0
+; CHECK-NEXT:    mov x13, #0 // =0x0
+; CHECK-NEXT:    mov x14, #0 // =0x0
+; CHECK-NEXT:    mov x15, #0 // =0x0
+; CHECK-NEXT:    mov x16, #0 // =0x0
+; CHECK-NEXT:    mov x17, #0 // =0x0
+; CHECK-NEXT:    mov x18, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -449,16 +449,16 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca
 ; DEFAULT:       // %bb.0: // %entry
 ; DEFAULT-NEXT:    fcvt d1, s1
 ; DEFAULT-NEXT:    fmul d0, d1, d0
-; DEFAULT-NEXT:    mov x0, #0
-; DEFAULT-NEXT:    mov x1, #0
-; DEFAULT-NEXT:    mov x2, #0
-; DEFAULT-NEXT:    mov x3, #0
-; DEFAULT-NEXT:    mov x4, #0
-; DEFAULT-NEXT:    mov x5, #0
-; DEFAULT-NEXT:    mov x6, #0
-; DEFAULT-NEXT:    mov x7, #0
-; DEFAULT-NEXT:    mov x8, #0
-; DEFAULT-NEXT:    mov x18, #0
+; DEFAULT-NEXT:    mov x0, #0 // =0x0
+; DEFAULT-NEXT:    mov x1, #0 // =0x0
+; DEFAULT-NEXT:    mov x2, #0 // =0x0
+; DEFAULT-NEXT:    mov x3, #0 // =0x0
+; DEFAULT-NEXT:    mov x4, #0 // =0x0
+; DEFAULT-NEXT:    mov x5, #0 // =0x0
+; DEFAULT-NEXT:    mov x6, #0 // =0x0
+; DEFAULT-NEXT:    mov x7, #0 // =0x0
+; DEFAULT-NEXT:    mov x8, #0 // =0x0
+; DEFAULT-NEXT:    mov x18, #0 // =0x0
 ; DEFAULT-NEXT:    movi v1.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v2.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v3.2d, #0000000000000000
@@ -472,16 +472,16 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca
 ; SVE:       // %bb.0: // %entry
 ; SVE-NEXT:    fcvt d1, s1
 ; SVE-NEXT:    fmul d0, d1, d0
-; SVE-NEXT:    mov x0, #0
-; SVE-NEXT:    mov x1, #0
-; SVE-NEXT:    mov x2, #0
-; SVE-NEXT:    mov x3, #0
-; SVE-NEXT:    mov x4, #0
-; SVE-NEXT:    mov x5, #0
-; SVE-NEXT:    mov x6, #0
-; SVE-NEXT:    mov x7, #0
-; SVE-NEXT:    mov x8, #0
-; SVE-NEXT:    mov x18, #0
+; SVE-NEXT:    mov x0, #0 // =0x0
+; SVE-NEXT:    mov x1, #0 // =0x0
+; SVE-NEXT:    mov x2, #0 // =0x0
+; SVE-NEXT:    mov x3, #0 // =0x0
+; SVE-NEXT:    mov x4, #0 // =0x0
+; SVE-NEXT:    mov x5, #0 // =0x0
+; SVE-NEXT:    mov x6, #0 // =0x0
+; SVE-NEXT:    mov x7, #0 // =0x0
+; SVE-NEXT:    mov x8, #0 // =0x0
+; SVE-NEXT:    mov x18, #0 // =0x0
 ; SVE-NEXT:    mov z1.d, #0 // =0x0
 ; SVE-NEXT:    mov z2.d, #0 // =0x0
 ; SVE-NEXT:    mov z3.d, #0 // =0x0
@@ -506,25 +506,25 @@ define dso_local double @all_float(double noundef %a, float noundef %b) local_un
 ; DEFAULT:       // %bb.0: // %entry
 ; DEFAULT-NEXT:    fcvt d1, s1
 ; DEFAULT-NEXT:    fmul d0, d1, d0
-; DEFAULT-NEXT:    mov x0, #0
-; DEFAULT-NEXT:    mov x1, #0
-; DEFAULT-NEXT:    mov x2, #0
-; DEFAULT-NEXT:    mov x3, #0
-; DEFAULT-NEXT:    mov x4, #0
-; DEFAULT-NEXT:    mov x5, #0
-; DEFAULT-NEXT:    mov x6, #0
-; DEFAULT-NEXT:    mov x7, #0
-; DEFAULT-NEXT:    mov x8, #0
-; DEFAULT-NEXT:    mov x9, #0
-; DEFAULT-NEXT:    mov x10, #0
-; DEFAULT-NEXT:    mov x11, #0
-; DEFAULT-NEXT:    mov x12, #0
-; DEFAULT-NEXT:    mov x13, #0
-; DEFAULT-NEXT:    mov x14, #0
-; DEFAULT-NEXT:    mov x15, #0
-; DEFAULT-NEXT:    mov x16, #0
-; DEFAULT-NEXT:    mov x17, #0
-; DEFAULT-NEXT:    mov x18, #0
+; DEFAULT-NEXT:    mov x0, #0 // =0x0
+; DEFAULT-NEXT:    mov x1, #0 // =0x0
+; DEFAULT-NEXT:    mov x2, #0 // =0x0
+; DEFAULT-NEXT:    mov x3, #0 // =0x0
+; DEFAULT-NEXT:    mov x4, #0 // =0x0
+; DEFAULT-NEXT:    mov x5, #0 // =0x0
+; DEFAULT-NEXT:    mov x6, #0 // =0x0
+; DEFAULT-NEXT:    mov x7, #0 // =0x0
+; DEFAULT-NEXT:    mov x8, #0 // =0x0
+; DEFAULT-NEXT:    mov x9, #0 // =0x0
+; DEFAULT-NEXT:    mov x10, #0 // =0x0
+; DEFAULT-NEXT:    mov x11, #0 // =0x0
+; DEFAULT-NEXT:    mov x12, #0 // =0x0
+; DEFAULT-NEXT:    mov x13, #0 // =0x0
+; DEFAULT-NEXT:    mov x14, #0 // =0x0
+; DEFAULT-NEXT:    mov x15, #0 // =0x0
+; DEFAULT-NEXT:    mov x16, #0 // =0x0
+; DEFAULT-NEXT:    mov x17, #0 // =0x0
+; DEFAULT-NEXT:    mov x18, #0 // =0x0
 ; DEFAULT-NEXT:    movi v1.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v2.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v3.2d, #0000000000000000
@@ -554,25 +554,25 @@ define dso_local double @all_float(double noundef %a, float noundef %b) local_un
 ; SVE:       // %bb.0: // %entry
 ; SVE-NEXT:    fcvt d1, s1
 ; SVE-NEXT:    fmul d0, d1, d0
-; SVE-NEXT:    mov x0, #0
-; SVE-NEXT:    mov x1, #0
-; SVE-NEXT:    mov x2, #0
-; SVE-NEXT:    mov x3, #0
-; SVE-NEXT:    mov x4, #0
-; SVE-NEXT:    mov x5, #0
-; SVE-NEXT:    mov x6, #0
-; SVE-NEXT:    mov x7, #0
-; SVE-NEXT:    mov x8, #0
-; SVE-NEXT:    mov x9, #0
-; SVE-NEXT:    mov x10, #0
-; SVE-NEXT:    mov x11, #0
-; SVE-NEXT:    mov x12, #0
-; SVE-NEXT:    mov x13, #0
-; SVE-NEXT:    mov x14, #0
-; SVE-NEXT:    mov x15, #0
-; SVE-NEXT:    mov x16, #0
-; SVE-NEXT:    mov x17, #0
-; SVE-NEXT:    mov x18, #0
+; SVE-NEXT:    mov x0, #0 // =0x0
+; SVE-NEXT:    mov x1, #0 // =0x0
+; SVE-NEXT:    mov x2, #0 // =0x0
+; SVE-NEXT:    mov x3, #0 // =0x0
+; SVE-NEXT:    mov x4, #0 // =0x0
+; SVE-NEXT:    mov x5, #0 // =0x0
+; SVE-NEXT:    mov x6, #0 // =0x0
+; SVE-NEXT:    mov x7, #0 // =0x0
+; SVE-NEXT:    mov x8, #0 // =0x0
+; SVE-NEXT:    mov x9, #0 // =0x0
+; SVE-NEXT:    mov x10, #0 // =0x0
+; SVE-NEXT:    mov x11, #0 // =0x0
+; SVE-NEXT:    mov x12, #0 // =0x0
+; SVE-NEXT:    mov x13, #0 // =0x0
+; SVE-NEXT:    mov x14, #0 // =0x0
+; SVE-NEXT:    mov x15, #0 // =0x0
+; SVE-NEXT:    mov x16, #0 // =0x0
+; SVE-NEXT:    mov x17, #0 // =0x0
+; SVE-NEXT:    mov x18, #0 // =0x0
 ; SVE-NEXT:    mov z1.d, #0 // =0x0
 ; SVE-NEXT:    mov z2.d, #0 // =0x0
 ; SVE-NEXT:    mov z3.d, #0 // =0x0

diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
index c4dd9a1eb1a6cb..f24abb56840009 100644
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -146,22 +146,22 @@ define void @zext_v16i8_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh0:
-; CHECK-NEXT:    adrp x9, lCPI0_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI0_0@PAGE
 ; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:    adrp x10, lCPI0_1@PAGE
+; CHECK-NEXT:    adrp x9, lCPI0_1@PAGE
 ; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:    adrp x11, lCPI0_2@PAGE
+; CHECK-NEXT:    adrp x10, lCPI0_2@PAGE
 ; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    adrp x12, lCPI0_3@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    ldr q0, [x8, lCPI0_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh4:
-; CHECK-NEXT:    ldr q0, [x9, lCPI0_0@PAGEOFF]
+; CHECK-NEXT:    adrp x8, lCPI0_3@PAGE
 ; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:    ldr q1, [x10, lCPI0_1@PAGEOFF]
+; CHECK-NEXT:    ldr q1, [x9, lCPI0_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh6:
-; CHECK-NEXT:    ldr q2, [x11, lCPI0_2@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x10, lCPI0_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh7:
-; CHECK-NEXT:    ldr q3, [x12, lCPI0_3@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x8, lCPI0_3@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q4, [x0, x8]
@@ -176,10 +176,11 @@ define void @zext_v16i8_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    b.ne LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh7
 ; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
 ; CHECK-NEXT:    .loh AdrpLdr Lloh1, Lloh5
-; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh4
+; CHECK-NEXT:    .loh AdrpAdrp Lloh0, Lloh4
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh3
 ;
 ; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -199,21 +200,21 @@ define void @zext_v16i8_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB0_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
-; CHECK-BE-NEXT:    add x10, x1, #32
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v4.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #48
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v3.16b
-; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v0.16b
-; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v2.16b
-; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v1.16b
+; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v2.16b
+; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v1.16b
+; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
 ; CHECK-BE-NEXT:    st1 { v5.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #32
+; CHECK-BE-NEXT:    st1 { v6.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    st1 { v6.16b }, [x1]
+; CHECK-BE-NEXT:    st1 { v4.16b }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v7.16b }, [x10]
-; CHECK-BE-NEXT:    st1 { v4.16b }, [x9]
+; CHECK-BE-NEXT:    st1 { v7.16b }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB0_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -281,7 +282,6 @@ define void @zext_v16i8_to_v16i32_in_loop_not_header(ptr %src, ptr %dst, i1 %c)
 ; CHECK-BE-NEXT:    // in Loop: Header=BB1_2 Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x10, x1, #32
-; CHECK-BE-NEXT:    add x11, x1, #16
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #48
 ; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
@@ -289,10 +289,11 @@ define void @zext_v16i8_to_v16i32_in_loop_not_header(ptr %src, ptr %dst, i1 %c)
 ; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
 ; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #16
 ; CHECK-BE-NEXT:    st1 { v1.4s }, [x10]
-; CHECK-BE-NEXT:    st1 { v3.4s }, [x11]
+; CHECK-BE-NEXT:    st1 { v3.4s }, [x9]
 ; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
 ; CHECK-BE-NEXT:    b .LBB1_1
 ; CHECK-BE-NEXT:  .LBB1_4: // %exit
@@ -345,13 +346,13 @@ define void @zext_v16i8_to_v16i32_no_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    st1 { v2.4s }, [x8]
 ; CHECK-BE-NEXT:    add x8, x1, #32
 ; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
 ; CHECK-BE-NEXT:    add x8, x1, #16
-; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT:    st1 { v3.4s }, [x8]
 ; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
 ; CHECK-BE-NEXT:    ret
 entry:
@@ -389,23 +390,23 @@ define void @zext_v16i8_to_v16i32_in_loop_optsize(ptr %src, ptr %dst) optsize {
 ; CHECK-BE-NEXT:  .LBB3_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
-; CHECK-BE-NEXT:    add x10, x1, #32
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #48
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #32
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x10]
-; CHECK-BE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-BE-NEXT:    st1 { v2.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v3.4s }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB3_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -455,23 +456,23 @@ define void @zext_v16i8_to_v16i32_in_loop_minsize(ptr %src, ptr %dst) minsize {
 ; CHECK-BE-NEXT:  .LBB4_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
-; CHECK-BE-NEXT:    add x10, x1, #32
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #48
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #32
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x10]
-; CHECK-BE-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-BE-NEXT:    st1 { v2.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v3.4s }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB4_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -516,14 +517,14 @@ define void @zext_v16i8_to_v16i16_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-BE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-BE-NEXT:    st1 { v1.8h }, [x1]
+; CHECK-BE-NEXT:    cmp x8, #128
+; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
+; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #32
-; CHECK-BE-NEXT:    st1 { v0.8h }, [x9]
+; CHECK-BE-NEXT:    st1 { v1.8h }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB5_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -621,14 +622,14 @@ define void @zext_v8i8_to_v8i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh8:
-; CHECK-NEXT:    adrp x9, lCPI6_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI6_0@PAGE
 ; CHECK-NEXT:  Lloh9:
-; CHECK-NEXT:    adrp x10, lCPI6_1@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x9, lCPI6_1@PAGE
 ; CHECK-NEXT:  Lloh10:
-; CHECK-NEXT:    ldr q0, [x9, lCPI6_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI6_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh11:
-; CHECK-NEXT:    ldr q1, [x10, lCPI6_1@PAGEOFF]
+; CHECK-NEXT:    ldr q1, [x9, lCPI6_1@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB6_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0, x8]
@@ -656,14 +657,14 @@ define void @zext_v8i8_to_v8i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v2.8b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    tbl v3.16b, { v2.16b }, v0.16b
-; CHECK-BE-NEXT:    tbl v2.16b, { v2.16b }, v1.16b
-; CHECK-BE-NEXT:    st1 { v3.16b }, [x1]
+; CHECK-BE-NEXT:    cmp x8, #128
+; CHECK-BE-NEXT:    tbl v3.16b, { v2.16b }, v1.16b
+; CHECK-BE-NEXT:    tbl v2.16b, { v2.16b }, v0.16b
+; CHECK-BE-NEXT:    st1 { v2.16b }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v2.16b }, [x9]
+; CHECK-BE-NEXT:    st1 { v3.16b }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB6_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -694,24 +695,24 @@ define void @zext_v16i8_to_v16i64_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ldr q0, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    ushll.8h v1, v0, #0
-; CHECK-NEXT:    ushll2.8h v0, v0, #0
+; CHECK-NEXT:    ushll2.8h v1, v0, #0
+; CHECK-NEXT:    ushll.8h v0, v0, #0
 ; CHECK-NEXT:    ushll2.4s v2, v1, #0
-; CHECK-NEXT:    ushll2.4s v3, v0, #0
+; CHECK-NEXT:    ushll.4s v1, v1, #0
+; CHECK-NEXT:    ushll2.4s v4, v0, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ushll2.2d v4, v3, #0
-; CHECK-NEXT:    ushll2.2d v5, v0, #0
-; CHECK-NEXT:    ushll.2d v0, v0, #0
-; CHECK-NEXT:    ushll.2d v3, v3, #0
-; CHECK-NEXT:    stp q0, q5, [x1, #64]
-; CHECK-NEXT:    ushll.4s v0, v1, #0
-; CHECK-NEXT:    stp q3, q4, [x1, #96]
 ; CHECK-NEXT:    ushll2.2d v3, v2, #0
 ; CHECK-NEXT:    ushll.2d v2, v2, #0
-; CHECK-NEXT:    ushll2.2d v1, v0, #0
+; CHECK-NEXT:    ushll2.2d v5, v1, #0
+; CHECK-NEXT:    ushll.2d v1, v1, #0
+; CHECK-NEXT:    stp q2, q3, [x1, #96]
+; CHECK-NEXT:    ushll2.2d v3, v4, #0
+; CHECK-NEXT:    ushll.2d v2, v4, #0
+; CHECK-NEXT:    ushll2.2d v4, v0, #0
 ; CHECK-NEXT:    ushll.2d v0, v0, #0
+; CHECK-NEXT:    stp q1, q5, [x1, #64]
 ; CHECK-NEXT:    stp q2, q3, [x1, #32]
-; CHECK-NEXT:    stp q0, q1, [x1], #128
+; CHECK-NEXT:    stp q0, q4, [x1], #128
 ; CHECK-NEXT:    b.ne LBB7_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
@@ -722,39 +723,39 @@ define void @zext_v16i8_to_v16i64_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB7_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
-; CHECK-BE-NEXT:    add x10, x1, #96
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #112
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
 ; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:    ushll2 v4.4s, v0.8h, #0
+; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    ushll2 v3.2d, v2.4s, #0
 ; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT:    ushll2 v5.2d, v1.4s, #0
+; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #96
+; CHECK-BE-NEXT:    ushll2 v3.2d, v4.4s, #0
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #80
-; CHECK-BE-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-BE-NEXT:    st1 { v2.2d }, [x10]
-; CHECK-BE-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-BE-NEXT:    add x10, x1, #48
+; CHECK-BE-NEXT:    ushll v2.2d, v4.2s, #0
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #48
 ; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-BE-NEXT:    add x9, x1, #64
-; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BE-NEXT:    ushll2 v4.2d, v2.4s, #0
 ; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
-; CHECK-BE-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-BE-NEXT:    add x9, x1, #32
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x1, #32
-; CHECK-BE-NEXT:    st1 { v1.2d }, [x1]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #128
-; CHECK-BE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-BE-NEXT:    st1 { v0.2d }, [x9]
-; CHECK-BE-NEXT:    st1 { v2.2d }, [x10]
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB7_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -806,24 +807,24 @@ define void @zext_v8i8_to_v8i64_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB8_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
-; CHECK-BE-NEXT:    add x10, x1, #32
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #48
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
 ; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    ushll2 v2.2d, v1.4s, #0
 ; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #32
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    ushll v2.2d, v0.2s, #0
-; CHECK-BE-NEXT:    st1 { v1.2d }, [x10]
-; CHECK-BE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-BE-NEXT:    st1 { v2.2d }, [x1]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #128
-; CHECK-BE-NEXT:    st1 { v0.2d }, [x9]
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB8_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -869,8 +870,8 @@ define void @zext_v8i8_to_v8i16_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #32
@@ -910,24 +911,24 @@ define void @zext_v8i8_to_v8i20_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ushll.8h v0, v0, #0
 ; CHECK-NEXT:    ushll2.4s v1, v0, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    mov.s w11, v1[1]
-; CHECK-NEXT:    mov.s w13, v0[1]
-; CHECK-NEXT:    fmov w12, s1
-; CHECK-NEXT:    mov.s w14, v1[2]
-; CHECK-NEXT:    fmov w15, s0
+; CHECK-NEXT:    mov.s w9, v1[1]
+; CHECK-NEXT:    mov.s w11, v0[1]
+; CHECK-NEXT:    fmov w10, s1
+; CHECK-NEXT:    fmov w14, s0
+; CHECK-NEXT:    mov.s w13, v1[2]
 ; CHECK-NEXT:    mov.s w16, v0[2]
-; CHECK-NEXT:    mov.s w9, v1[3]
-; CHECK-NEXT:    mov.s w10, v0[3]
-; CHECK-NEXT:    orr x11, x12, x11, lsl #20
-; CHECK-NEXT:    orr x12, x15, x13, lsl #20
-; CHECK-NEXT:    orr x11, x11, x14, lsl #40
-; CHECK-NEXT:    orr x12, x12, x16, lsl #40
-; CHECK-NEXT:    lsr w13, w9, #4
-; CHECK-NEXT:    lsr w14, w10, #4
-; CHECK-NEXT:    orr x9, x11, x9, lsl #60
-; CHECK-NEXT:    orr x10, x12, x10, lsl #60
-; CHECK-NEXT:    strh w13, [x1, #18]
-; CHECK-NEXT:    strh w14, [x1, #8]
+; CHECK-NEXT:    mov.s w12, v1[3]
+; CHECK-NEXT:    mov.s w15, v0[3]
+; CHECK-NEXT:    orr x9, x10, x9, lsl #20
+; CHECK-NEXT:    orr x10, x14, x11, lsl #20
+; CHECK-NEXT:    orr x9, x9, x13, lsl #40
+; CHECK-NEXT:    orr x10, x10, x16, lsl #40
+; CHECK-NEXT:    lsr w11, w12, #4
+; CHECK-NEXT:    lsr w13, w15, #4
+; CHECK-NEXT:    orr x9, x9, x12, lsl #60
+; CHECK-NEXT:    orr x10, x10, x15, lsl #60
+; CHECK-NEXT:    strh w11, [x1, #18]
+; CHECK-NEXT:    strh w13, [x1, #8]
 ; CHECK-NEXT:    stur x9, [x1, #10]
 ; CHECK-NEXT:    str x10, [x1], #64
 ; CHECK-NEXT:    b.ne LBB10_1
@@ -941,35 +942,35 @@ define void @zext_v8i8_to_v8i20_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
 ; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    mov w9, v1.s[1]
-; CHECK-BE-NEXT:    mov w11, v1.s[2]
+; CHECK-BE-NEXT:    mov w10, v0.s[1]
 ; CHECK-BE-NEXT:    fmov w12, s1
-; CHECK-BE-NEXT:    mov w13, v0.s[1]
-; CHECK-BE-NEXT:    mov w15, v0.s[2]
-; CHECK-BE-NEXT:    mov w10, v1.s[3]
-; CHECK-BE-NEXT:    mov w14, v0.s[3]
+; CHECK-BE-NEXT:    fmov w14, s0
+; CHECK-BE-NEXT:    mov w11, v1.s[2]
+; CHECK-BE-NEXT:    mov w13, v0.s[2]
+; CHECK-BE-NEXT:    mov w15, v1.s[3]
 ; CHECK-BE-NEXT:    lsl x9, x9, #40
+; CHECK-BE-NEXT:    lsl x10, x10, #40
 ; CHECK-BE-NEXT:    orr x9, x9, x12, lsl #60
+; CHECK-BE-NEXT:    orr x10, x10, x14, lsl #60
 ; CHECK-BE-NEXT:    lsr x12, x12, #4
+; CHECK-BE-NEXT:    strh w15, [x1, #18]
 ; CHECK-BE-NEXT:    orr x9, x9, x11, lsl #20
-; CHECK-BE-NEXT:    fmov w11, s0
-; CHECK-BE-NEXT:    lsl x13, x13, #40
+; CHECK-BE-NEXT:    orr x10, x10, x13, lsl #20
+; CHECK-BE-NEXT:    mov w11, v0.s[3]
+; CHECK-BE-NEXT:    lsr x13, x14, #4
 ; CHECK-BE-NEXT:    lsr x9, x9, #16
+; CHECK-BE-NEXT:    lsr x10, x10, #16
 ; CHECK-BE-NEXT:    bfi x9, x12, #48, #4
-; CHECK-BE-NEXT:    strh w10, [x1, #18]
-; CHECK-BE-NEXT:    orr x13, x13, x11, lsl #60
-; CHECK-BE-NEXT:    lsr x11, x11, #4
-; CHECK-BE-NEXT:    orr x13, x13, x15, lsl #20
-; CHECK-BE-NEXT:    strh w14, [x1, #8]
-; CHECK-BE-NEXT:    lsr x12, x13, #16
+; CHECK-BE-NEXT:    bfi x10, x13, #48, #4
+; CHECK-BE-NEXT:    strh w11, [x1, #8]
 ; CHECK-BE-NEXT:    stur x9, [x1, #10]
-; CHECK-BE-NEXT:    bfi x12, x11, #48, #4
-; CHECK-BE-NEXT:    str x12, [x1], #64
+; CHECK-BE-NEXT:    str x10, [x1], #64
 ; CHECK-BE-NEXT:    b.ne .LBB10_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -1155,18 +1156,18 @@ define void @zext_v12i8_to_v12i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v12i8_to_v12i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh12:
-; CHECK-NEXT:    adrp x9, lCPI12_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI12_0@PAGE
 ; CHECK-NEXT:  Lloh13:
-; CHECK-NEXT:    adrp x10, lCPI12_1@PAGE
+; CHECK-NEXT:    adrp x9, lCPI12_1@PAGE
 ; CHECK-NEXT:  Lloh14:
-; CHECK-NEXT:    adrp x11, lCPI12_2@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x10, lCPI12_2@PAGE
 ; CHECK-NEXT:  Lloh15:
-; CHECK-NEXT:    ldr q0, [x9, lCPI12_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI12_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh16:
-; CHECK-NEXT:    ldr q1, [x10, lCPI12_1@PAGEOFF]
+; CHECK-NEXT:    ldr q1, [x9, lCPI12_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh17:
-; CHECK-NEXT:    ldr q2, [x11, lCPI12_2@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x10, lCPI12_2@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB12_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q3, [x0, x8]
@@ -1199,18 +1200,18 @@ define void @zext_v12i8_to_v12i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB12_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
-; CHECK-BE-NEXT:    add x10, x1, #16
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
+; CHECK-BE-NEXT:    add x10, x1, #16
 ; CHECK-BE-NEXT:    ld1 { v3.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #32
-; CHECK-BE-NEXT:    tbl v4.16b, { v3.16b }, v0.16b
-; CHECK-BE-NEXT:    tbl v5.16b, { v3.16b }, v2.16b
-; CHECK-BE-NEXT:    tbl v3.16b, { v3.16b }, v1.16b
-; CHECK-BE-NEXT:    st1 { v4.16b }, [x1]
+; CHECK-BE-NEXT:    cmp x8, #128
+; CHECK-BE-NEXT:    tbl v4.16b, { v3.16b }, v2.16b
+; CHECK-BE-NEXT:    tbl v5.16b, { v3.16b }, v1.16b
+; CHECK-BE-NEXT:    tbl v3.16b, { v3.16b }, v0.16b
+; CHECK-BE-NEXT:    st1 { v3.16b }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v5.16b }, [x9]
-; CHECK-BE-NEXT:    st1 { v3.16b }, [x10]
+; CHECK-BE-NEXT:    st1 { v4.16b }, [x9]
+; CHECK-BE-NEXT:    st1 { v5.16b }, [x10]
 ; CHECK-BE-NEXT:    b.ne .LBB12_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -1244,11 +1245,11 @@ define void @zext_v16i4_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ldr x9, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    and w10, w9, #0xf
-; CHECK-NEXT:    ubfx w11, w9, #4, #4
-; CHECK-NEXT:    fmov s1, w10
+; CHECK-NEXT:    and w11, w9, #0xf
+; CHECK-NEXT:    ubfx w10, w9, #4, #4
+; CHECK-NEXT:    fmov s1, w11
+; CHECK-NEXT:    mov.b v1[1], w10
 ; CHECK-NEXT:    ubfx w10, w9, #8, #4
-; CHECK-NEXT:    mov.b v1[1], w11
 ; CHECK-NEXT:    mov.b v1[2], w10
 ; CHECK-NEXT:    ubfx w10, w9, #12, #4
 ; CHECK-NEXT:    mov.b v1[3], w10
@@ -1279,18 +1280,18 @@ define void @zext_v16i4_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ext.16b v2, v1, v1, #8
 ; CHECK-NEXT:    zip2.8b v3, v1, v0
 ; CHECK-NEXT:    zip1.8b v1, v1, v0
-; CHECK-NEXT:    zip1.8b v4, v2, v0
-; CHECK-NEXT:    zip2.8b v2, v2, v0
+; CHECK-NEXT:    zip2.8b v4, v2, v0
+; CHECK-NEXT:    zip1.8b v2, v2, v0
 ; CHECK-NEXT:    ushll.4s v3, v3, #0
 ; CHECK-NEXT:    ushll.4s v1, v1, #0
 ; CHECK-NEXT:    and.16b v3, v3, v0
+; CHECK-NEXT:    ushll.4s v4, v4, #0
+; CHECK-NEXT:    ushll.4s v2, v2, #0
 ; CHECK-NEXT:    and.16b v1, v1, v0
-; CHECK-NEXT:    stp q1, q3, [x1]
-; CHECK-NEXT:    ushll.4s v1, v2, #0
-; CHECK-NEXT:    ushll.4s v2, v4, #0
-; CHECK-NEXT:    and.16b v1, v1, v0
+; CHECK-NEXT:    and.16b v4, v4, v0
 ; CHECK-NEXT:    and.16b v2, v2, v0
-; CHECK-NEXT:    stp q2, q1, [x1, #32]
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q2, q4, [x1, #32]
 ; CHECK-NEXT:    add x1, x1, #64
 ; CHECK-NEXT:    b.ne LBB13_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
@@ -1310,7 +1311,6 @@ define void @zext_v16i4_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    fmov s1, w10
 ; CHECK-BE-NEXT:    ubfx x10, x9, #52, #4
 ; CHECK-BE-NEXT:    mov v1.b[1], w11
-; CHECK-BE-NEXT:    add x11, x1, #32
 ; CHECK-BE-NEXT:    mov v1.b[2], w10
 ; CHECK-BE-NEXT:    ubfx x10, x9, #48, #4
 ; CHECK-BE-NEXT:    mov v1.b[3], w10
@@ -1337,31 +1337,32 @@ define void @zext_v16i4_to_v16i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    ubfx w10, w9, #4, #4
 ; CHECK-BE-NEXT:    and w9, w9, #0xf
 ; CHECK-BE-NEXT:    mov v1.b[14], w10
-; CHECK-BE-NEXT:    add x10, x1, #48
+; CHECK-BE-NEXT:    add x10, x1, #32
 ; CHECK-BE-NEXT:    mov v1.b[15], w9
 ; CHECK-BE-NEXT:    add x9, x1, #16
 ; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
 ; CHECK-BE-NEXT:    zip2 v3.8b, v1.8b, v0.8b
 ; CHECK-BE-NEXT:    zip1 v1.8b, v1.8b, v0.8b
-; CHECK-BE-NEXT:    zip1 v4.8b, v2.8b, v0.8b
-; CHECK-BE-NEXT:    zip2 v2.8b, v2.8b, v0.8b
-; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
+; CHECK-BE-NEXT:    zip2 v4.8b, v2.8b, v0.8b
+; CHECK-BE-NEXT:    zip1 v2.8b, v2.8b, v0.8b
 ; CHECK-BE-NEXT:    rev16 v3.8b, v3.8b
+; CHECK-BE-NEXT:    rev16 v1.8b, v1.8b
 ; CHECK-BE-NEXT:    rev16 v4.8b, v4.8b
 ; CHECK-BE-NEXT:    rev16 v2.8b, v2.8b
-; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-BE-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-BE-NEXT:    and v1.16b, v1.16b, v0.16b
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x1]
-; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    ushll v1.4s, v2.4h, #0
-; CHECK-BE-NEXT:    ushll v2.4s, v4.4h, #0
+; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-BE-NEXT:    and v3.16b, v3.16b, v0.16b
+; CHECK-BE-NEXT:    ushll v4.4s, v4.4h, #0
+; CHECK-BE-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-BE-NEXT:    and v1.16b, v1.16b, v0.16b
 ; CHECK-BE-NEXT:    st1 { v3.4s }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #48
+; CHECK-BE-NEXT:    and v4.16b, v4.16b, v0.16b
 ; CHECK-BE-NEXT:    and v2.16b, v2.16b, v0.16b
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x10]
-; CHECK-BE-NEXT:    st1 { v2.4s }, [x11]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x1]
+; CHECK-BE-NEXT:    add x1, x1, #64
+; CHECK-BE-NEXT:    st1 { v4.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v2.4s }, [x10]
 ; CHECK-BE-NEXT:    b.ne .LBB13_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -1393,24 +1394,24 @@ define void @zext_v16i16_to_v16i64_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    add x8, x8, #32
+; CHECK-NEXT:    ldp q1, q0, [x9]
 ; CHECK-NEXT:    cmp x8, #256
-; CHECK-NEXT:    ldp q0, q1, [x9]
-; CHECK-NEXT:    ushll.4s v2, v0, #0
-; CHECK-NEXT:    ushll2.4s v0, v0, #0
-; CHECK-NEXT:    ushll.4s v3, v1, #0
-; CHECK-NEXT:    ushll2.4s v1, v1, #0
-; CHECK-NEXT:    ushll2.2d v5, v0, #0
-; CHECK-NEXT:    ushll2.2d v4, v1, #0
-; CHECK-NEXT:    ushll.2d v1, v1, #0
-; CHECK-NEXT:    ushll.2d v0, v0, #0
-; CHECK-NEXT:    stp q1, q4, [x1, #96]
-; CHECK-NEXT:    ushll2.2d v1, v3, #0
-; CHECK-NEXT:    stp q0, q5, [x1, #32]
+; CHECK-NEXT:    ushll2.4s v2, v0, #0
+; CHECK-NEXT:    ushll2.4s v3, v1, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ushll.4s v1, v1, #0
+; CHECK-NEXT:    ushll2.2d v4, v2, #0
+; CHECK-NEXT:    ushll.2d v2, v2, #0
+; CHECK-NEXT:    ushll2.2d v5, v3, #0
 ; CHECK-NEXT:    ushll.2d v3, v3, #0
-; CHECK-NEXT:    ushll2.2d v0, v2, #0
-; CHECK-NEXT:    stp q3, q1, [x1, #64]
-; CHECK-NEXT:    ushll.2d v1, v2, #0
-; CHECK-NEXT:    stp q1, q0, [x1], #128
+; CHECK-NEXT:    stp q2, q4, [x1, #96]
+; CHECK-NEXT:    ushll2.2d v4, v0, #0
+; CHECK-NEXT:    ushll.2d v0, v0, #0
+; CHECK-NEXT:    ushll2.2d v2, v1, #0
+; CHECK-NEXT:    ushll.2d v1, v1, #0
+; CHECK-NEXT:    stp q3, q5, [x1, #32]
+; CHECK-NEXT:    stp q0, q4, [x1, #64]
+; CHECK-NEXT:    stp q1, q2, [x1], #128
 ; CHECK-NEXT:    b.ne LBB14_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
@@ -1421,39 +1422,39 @@ define void @zext_v16i16_to_v16i64_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB14_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
-; CHECK-BE-NEXT:    add x10, x1, #48
 ; CHECK-BE-NEXT:    add x8, x8, #32
-; CHECK-BE-NEXT:    cmp x8, #256
 ; CHECK-BE-NEXT:    ld1 { v0.8h }, [x9]
 ; CHECK-BE-NEXT:    add x9, x9, #16
-; CHECK-BE-NEXT:    ld1 { v2.8h }, [x9]
-; CHECK-BE-NEXT:    add x9, x1, #32
-; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT:    cmp x8, #256
+; CHECK-BE-NEXT:    ld1 { v1.8h }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #48
+; CHECK-BE-NEXT:    ushll2 v2.4s, v0.8h, #0
 ; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BE-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x1, #112
-; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
+; CHECK-BE-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:    ushll2 v4.2d, v2.4s, #0
+; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT:    ushll2 v5.2d, v0.4s, #0
+; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT:    st1 { v4.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #32
+; CHECK-BE-NEXT:    ushll2 v4.2d, v3.4s, #0
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
-; CHECK-BE-NEXT:    ushll2 v1.4s, v2.8h, #0
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT:    ushll v2.2d, v3.2s, #0
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #112
+; CHECK-BE-NEXT:    ushll2 v3.2d, v1.4s, #0
+; CHECK-BE-NEXT:    st1 { v4.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #96
-; CHECK-BE-NEXT:    ushll2 v4.2d, v1.4s, #0
-; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
-; CHECK-BE-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-BE-NEXT:    add x10, x1, #80
-; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
-; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #80
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #64
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #128
-; CHECK-BE-NEXT:    ushll v3.2d, v2.2s, #0
-; CHECK-BE-NEXT:    ushll2 v2.2d, v2.4s, #0
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
-; CHECK-BE-NEXT:    st1 { v2.2d }, [x10]
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB14_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -1485,21 +1486,21 @@ define void @zext_v16i32_to_v16i64_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    add x8, x8, #64
-; CHECK-NEXT:    cmp x8, #512
 ; CHECK-NEXT:    ldp q1, q0, [x9, #32]
-; CHECK-NEXT:    ushll2.2d v5, v1, #0
-; CHECK-NEXT:    ushll.2d v1, v1, #0
-; CHECK-NEXT:    ldp q3, q2, [x9]
-; CHECK-NEXT:    ushll2.2d v4, v0, #0
-; CHECK-NEXT:    stp q1, q5, [x1, #64]
+; CHECK-NEXT:    cmp x8, #512
+; CHECK-NEXT:    ldp q5, q4, [x9]
+; CHECK-NEXT:    ushll2.2d v2, v0, #0
 ; CHECK-NEXT:    ushll.2d v0, v0, #0
-; CHECK-NEXT:    stp q0, q4, [x1, #96]
-; CHECK-NEXT:    ushll2.2d v1, v3, #0
-; CHECK-NEXT:    ushll2.2d v0, v2, #0
-; CHECK-NEXT:    ushll.2d v2, v2, #0
-; CHECK-NEXT:    stp q2, q0, [x1, #32]
-; CHECK-NEXT:    ushll.2d v0, v3, #0
-; CHECK-NEXT:    stp q0, q1, [x1], #128
+; CHECK-NEXT:    ushll2.2d v3, v1, #0
+; CHECK-NEXT:    ushll.2d v1, v1, #0
+; CHECK-NEXT:    stp q0, q2, [x1, #96]
+; CHECK-NEXT:    ushll2.2d v2, v4, #0
+; CHECK-NEXT:    ushll.2d v0, v4, #0
+; CHECK-NEXT:    stp q1, q3, [x1, #64]
+; CHECK-NEXT:    ushll2.2d v3, v5, #0
+; CHECK-NEXT:    ushll.2d v1, v5, #0
+; CHECK-NEXT:    stp q0, q2, [x1, #32]
+; CHECK-NEXT:    stp q1, q3, [x1], #128
 ; CHECK-NEXT:    b.ne LBB15_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
@@ -1511,37 +1512,37 @@ define void @zext_v16i32_to_v16i64_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x8, x8, #64
+; CHECK-BE-NEXT:    ld1 { v0.4s }, [x9]
 ; CHECK-BE-NEXT:    add x10, x9, #48
-; CHECK-BE-NEXT:    add x11, x9, #32
 ; CHECK-BE-NEXT:    cmp x8, #512
-; CHECK-BE-NEXT:    ld1 { v0.4s }, [x9]
-; CHECK-BE-NEXT:    add x9, x9, #16
 ; CHECK-BE-NEXT:    ld1 { v1.4s }, [x10]
-; CHECK-BE-NEXT:    add x10, x1, #16
-; CHECK-BE-NEXT:    ld1 { v2.4s }, [x11]
-; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT:    add x10, x9, #32
+; CHECK-BE-NEXT:    add x9, x9, #16
 ; CHECK-BE-NEXT:    ld1 { v4.4s }, [x9]
-; CHECK-BE-NEXT:    add x9, x1, #112
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT:    ld1 { v2.4s }, [x10]
+; CHECK-BE-NEXT:    add x9, x1, #16
+; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-BE-NEXT:    add x10, x1, #80
-; CHECK-BE-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-BE-NEXT:    ushll2 v5.2d, v2.4s, #0
+; CHECK-BE-NEXT:    ushll2 v5.2d, v1.4s, #0
+; CHECK-BE-NEXT:    ushll2 v6.2d, v2.4s, #0
 ; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x1, #48
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x1, #96
-; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-BE-NEXT:    ushll v3.2d, v4.2s, #0
-; CHECK-BE-NEXT:    ushll2 v4.2d, v4.4s, #0
+; CHECK-BE-NEXT:    ushll2 v3.2d, v4.4s, #0
+; CHECK-BE-NEXT:    add x9, x1, #112
 ; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
-; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x9]
+; CHECK-BE-NEXT:    ushll v0.2d, v1.2s, #0
+; CHECK-BE-NEXT:    ushll v1.2d, v2.2s, #0
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #48
+; CHECK-BE-NEXT:    ushll v2.2d, v4.2s, #0
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #64
-; CHECK-BE-NEXT:    st1 { v1.2d }, [x10]
-; CHECK-BE-NEXT:    add x10, x1, #32
+; CHECK-BE-NEXT:    st1 { v6.2d }, [x10]
+; CHECK-BE-NEXT:    add x10, x1, #96
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #32
 ; CHECK-BE-NEXT:    add x1, x1, #128
-; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x10]
 ; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB15_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
@@ -1574,35 +1575,35 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d0, [x0, x8]
 ; CHECK-NEXT:    add x9, x1, #112
-; CHECK-NEXT:    add x10, x1, #80
-; CHECK-NEXT:    str xzr, [x1, #120]
-; CHECK-NEXT:    str xzr, [x1, #104]
 ; CHECK-NEXT:    add x8, x8, #16
-; CHECK-NEXT:    str xzr, [x1, #88]
+; CHECK-NEXT:    str xzr, [x1, #120]
 ; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    ushll.8h v0, v0, #0
+; CHECK-NEXT:    str xzr, [x1, #104]
+; CHECK-NEXT:    str xzr, [x1, #88]
 ; CHECK-NEXT:    str xzr, [x1, #72]
-; CHECK-NEXT:    str xzr, [x1, #56]
 ; CHECK-NEXT:    ushll2.4s v1, v0, #0
-; CHECK-NEXT:    str xzr, [x1, #40]
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    str xzr, [x1, #56]
+; CHECK-NEXT:    str xzr, [x1, #40]
 ; CHECK-NEXT:    str xzr, [x1, #24]
 ; CHECK-NEXT:    ushll2.2d v2, v1, #0
-; CHECK-NEXT:    str xzr, [x1, #8]
 ; CHECK-NEXT:    ushll.2d v1, v1, #0
+; CHECK-NEXT:    ushll2.2d v3, v0, #0
+; CHECK-NEXT:    ushll.2d v0, v0, #0
+; CHECK-NEXT:    str xzr, [x1, #8]
 ; CHECK-NEXT:    st1.d { v2 }[1], [x9]
+; CHECK-NEXT:    add x9, x1, #80
+; CHECK-NEXT:    st1.d { v1 }[1], [x9]
 ; CHECK-NEXT:    add x9, x1, #48
 ; CHECK-NEXT:    str d2, [x1, #96]
-; CHECK-NEXT:    ushll2.2d v2, v0, #0
-; CHECK-NEXT:    st1.d { v1 }[1], [x10]
-; CHECK-NEXT:    ushll.2d v0, v0, #0
+; CHECK-NEXT:    st1.d { v3 }[1], [x9]
+; CHECK-NEXT:    add x9, x1, #16
 ; CHECK-NEXT:    str d1, [x1, #64]
-; CHECK-NEXT:    str d2, [x1, #32]
-; CHECK-NEXT:    add x10, x1, #16
+; CHECK-NEXT:    str d3, [x1, #32]
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    add x1, x1, #256
-; CHECK-NEXT:    st1.d { v2 }[1], [x9]
-; CHECK-NEXT:    st1.d { v0 }[1], [x10]
+; CHECK-NEXT:    st1.d { v0 }[1], [x9]
 ; CHECK-NEXT:    b.ne LBB16_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
@@ -1613,37 +1614,37 @@ define void @zext_v8i8_to_v8i128_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB16_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
-; CHECK-BE-NEXT:    add x10, x1, #88
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #120
 ; CHECK-BE-NEXT:    str xzr, [x1, #112]
 ; CHECK-BE-NEXT:    str xzr, [x1, #96]
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    str xzr, [x1, #80]
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    str xzr, [x1, #64]
 ; CHECK-BE-NEXT:    str xzr, [x1, #48]
-; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
 ; CHECK-BE-NEXT:    str xzr, [x1, #32]
+; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
 ; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    str xzr, [x1, #16]
-; CHECK-BE-NEXT:    ushll2 v2.2d, v1.4s, #0
 ; CHECK-BE-NEXT:    str xzr, [x1]
+; CHECK-BE-NEXT:    ushll2 v2.2d, v1.4s, #0
 ; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-BE-NEXT:    st1 { v2.d }[1], [x9]
+; CHECK-BE-NEXT:    add x9, x1, #88
+; CHECK-BE-NEXT:    st1 { v1.d }[1], [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #56
 ; CHECK-BE-NEXT:    str d2, [x1, #104]
-; CHECK-BE-NEXT:    ushll2 v2.2d, v0.4s, #0
-; CHECK-BE-NEXT:    st1 { v1.d }[1], [x10]
-; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT:    st1 { v3.d }[1], [x9]
+; CHECK-BE-NEXT:    add x9, x1, #24
 ; CHECK-BE-NEXT:    str d1, [x1, #72]
-; CHECK-BE-NEXT:    str d2, [x1, #40]
-; CHECK-BE-NEXT:    add x10, x1, #24
+; CHECK-BE-NEXT:    str d3, [x1, #40]
 ; CHECK-BE-NEXT:    str d0, [x1, #8]
 ; CHECK-BE-NEXT:    add x1, x1, #256
-; CHECK-BE-NEXT:    st1 { v2.d }[1], [x9]
-; CHECK-BE-NEXT:    st1 { v0.d }[1], [x10]
+; CHECK-BE-NEXT:    st1 { v0.d }[1], [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB16_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -1678,36 +1679,36 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  Lloh20:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI17_0@PAGEOFF]
-; CHECK-NEXT:    add x9, x0, #8
 ; CHECK-NEXT:  Lloh21:
 ; CHECK-NEXT:    ldr q1, [x10, lCPI17_1@PAGEOFF]
+; CHECK-NEXT:    add x9, x0, #8
 ; CHECK-NEXT:  LBB17_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldp d2, d4, [x9, #-8]
 ; CHECK-NEXT:    add x10, x1, x8
+; CHECK-NEXT:    ldp q6, q5, [x10, #32]
 ; CHECK-NEXT:    add x8, x8, #128
-; CHECK-NEXT:    ldp d2, d3, [x9, #-8]
-; CHECK-NEXT:    add x9, x9, #16
+; CHECK-NEXT:    ldp q17, q16, [x10]
 ; CHECK-NEXT:    cmp x8, #1024
-; CHECK-NEXT:    ldp q5, q4, [x10, #32]
-; CHECK-NEXT:    tbl.16b v6, { v2 }, v1
+; CHECK-NEXT:    tbl.16b v3, { v2 }, v1
 ; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
-; CHECK-NEXT:    tbl.16b v17, { v3 }, v0
-; CHECK-NEXT:    tbl.16b v3, { v3 }, v1
-; CHECK-NEXT:    ldp q16, q7, [x10]
-; CHECK-NEXT:    uaddw2.2d v4, v4, v6
-; CHECK-NEXT:    uaddw.2d v5, v5, v6
-; CHECK-NEXT:    stp q5, q4, [x10, #32]
-; CHECK-NEXT:    ldp q19, q18, [x10, #96]
-; CHECK-NEXT:    uaddw2.2d v7, v7, v2
-; CHECK-NEXT:    uaddw.2d v2, v16, v2
-; CHECK-NEXT:    stp q2, q7, [x10]
-; CHECK-NEXT:    ldp q6, q20, [x10, #64]
-; CHECK-NEXT:    uaddw2.2d v4, v18, v3
-; CHECK-NEXT:    uaddw.2d v3, v19, v3
-; CHECK-NEXT:    stp q3, q4, [x10, #96]
-; CHECK-NEXT:    uaddw2.2d v2, v20, v17
-; CHECK-NEXT:    uaddw.2d v4, v6, v17
-; CHECK-NEXT:    stp q4, q2, [x10, #64]
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
+; CHECK-NEXT:    add x9, x9, #16
+; CHECK-NEXT:    uaddw2.2d v5, v5, v3
+; CHECK-NEXT:    uaddw.2d v3, v6, v3
+; CHECK-NEXT:    uaddw2.2d v6, v16, v2
+; CHECK-NEXT:    ldp q18, q16, [x10, #96]
+; CHECK-NEXT:    uaddw.2d v2, v17, v2
+; CHECK-NEXT:    stp q3, q5, [x10, #32]
+; CHECK-NEXT:    ldp q17, q5, [x10, #64]
+; CHECK-NEXT:    uaddw2.2d v16, v16, v7
+; CHECK-NEXT:    uaddw.2d v7, v18, v7
+; CHECK-NEXT:    stp q2, q6, [x10]
+; CHECK-NEXT:    uaddw2.2d v3, v5, v4
+; CHECK-NEXT:    uaddw.2d v4, v17, v4
+; CHECK-NEXT:    stp q7, q16, [x10, #96]
+; CHECK-NEXT:    stp q4, q3, [x10, #64]
 ; CHECK-NEXT:    b.ne LBB17_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
@@ -1726,60 +1727,60 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst)
 ; CHECK-BE-NEXT:    add x9, x0, #8
 ; CHECK-BE-NEXT:  .LBB17_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT:    sub x12, x9, #8
-; CHECK-BE-NEXT:    add x10, x1, x8
+; CHECK-BE-NEXT:    sub x10, x9, #8
 ; CHECK-BE-NEXT:    ld1 { v2.8b }, [x9]
+; CHECK-BE-NEXT:    add x9, x9, #16
+; CHECK-BE-NEXT:    ld1 { v3.8b }, [x10]
+; CHECK-BE-NEXT:    add x10, x1, x8
+; CHECK-BE-NEXT:    add x8, x8, #128
+; CHECK-BE-NEXT:    add x15, x10, #96
 ; CHECK-BE-NEXT:    add x11, x10, #32
-; CHECK-BE-NEXT:    add x13, x10, #48
-; CHECK-BE-NEXT:    add x14, x10, #16
-; CHECK-BE-NEXT:    ld1 { v4.8b }, [x12]
-; CHECK-BE-NEXT:    add x15, x10, #64
-; CHECK-BE-NEXT:    ld1 { v3.2d }, [x11]
-; CHECK-BE-NEXT:    add x12, x10, #96
-; CHECK-BE-NEXT:    tbl v6.16b, { v2.16b }, v1.16b
-; CHECK-BE-NEXT:    add x16, x10, #112
+; CHECK-BE-NEXT:    add x14, x10, #64
+; CHECK-BE-NEXT:    tbl v4.16b, { v2.16b }, v1.16b
 ; CHECK-BE-NEXT:    tbl v2.16b, { v2.16b }, v0.16b
-; CHECK-BE-NEXT:    ld1 { v7.2d }, [x13]
-; CHECK-BE-NEXT:    tbl v16.16b, { v4.16b }, v0.16b
+; CHECK-BE-NEXT:    ld1 { v16.2d }, [x15]
+; CHECK-BE-NEXT:    tbl v5.16b, { v3.16b }, v1.16b
+; CHECK-BE-NEXT:    tbl v3.16b, { v3.16b }, v0.16b
+; CHECK-BE-NEXT:    ld1 { v6.2d }, [x10]
+; CHECK-BE-NEXT:    ld1 { v19.2d }, [x14]
+; CHECK-BE-NEXT:    ld1 { v21.2d }, [x11]
+; CHECK-BE-NEXT:    add x12, x10, #48
+; CHECK-BE-NEXT:    add x13, x10, #16
+; CHECK-BE-NEXT:    add x16, x10, #112
 ; CHECK-BE-NEXT:    add x17, x10, #80
-; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v1.16b
-; CHECK-BE-NEXT:    ld1 { v18.2d }, [x14]
-; CHECK-BE-NEXT:    rev32 v17.8b, v6.8b
-; CHECK-BE-NEXT:    add x8, x8, #128
-; CHECK-BE-NEXT:    ext v6.16b, v6.16b, v6.16b, #8
-; CHECK-BE-NEXT:    ld1 { v5.2d }, [x10]
-; CHECK-BE-NEXT:    ext v23.16b, v16.16b, v16.16b, #8
-; CHECK-BE-NEXT:    add x9, x9, #16
-; CHECK-BE-NEXT:    ext v21.16b, v4.16b, v4.16b, #8
-; CHECK-BE-NEXT:    ld1 { v20.2d }, [x12]
-; CHECK-BE-NEXT:    rev32 v4.8b, v4.8b
+; CHECK-BE-NEXT:    rev32 v7.8b, v4.8b
+; CHECK-BE-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
+; CHECK-BE-NEXT:    rev32 v17.8b, v2.8b
+; CHECK-BE-NEXT:    ext v18.16b, v5.16b, v5.16b, #8
+; CHECK-BE-NEXT:    ext v20.16b, v3.16b, v3.16b, #8
+; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    rev32 v5.8b, v5.8b
+; CHECK-BE-NEXT:    rev32 v3.8b, v3.8b
 ; CHECK-BE-NEXT:    cmp x8, #1024
-; CHECK-BE-NEXT:    ext v19.16b, v2.16b, v2.16b, #8
-; CHECK-BE-NEXT:    ld1 { v22.2d }, [x15]
+; CHECK-BE-NEXT:    rev32 v4.8b, v4.8b
+; CHECK-BE-NEXT:    uaddw v7.2d, v16.2d, v7.2s
+; CHECK-BE-NEXT:    ld1 { v16.2d }, [x16]
+; CHECK-BE-NEXT:    rev32 v18.8b, v18.8b
+; CHECK-BE-NEXT:    rev32 v20.8b, v20.8b
 ; CHECK-BE-NEXT:    rev32 v2.8b, v2.8b
-; CHECK-BE-NEXT:    rev32 v21.8b, v21.8b
-; CHECK-BE-NEXT:    ld1 { v24.2d }, [x16]
-; CHECK-BE-NEXT:    uaddw v3.2d, v3.2d, v4.2s
-; CHECK-BE-NEXT:    rev32 v4.8b, v23.8b
-; CHECK-BE-NEXT:    ld1 { v23.2d }, [x17]
-; CHECK-BE-NEXT:    rev32 v16.8b, v16.8b
-; CHECK-BE-NEXT:    rev32 v6.8b, v6.8b
-; CHECK-BE-NEXT:    rev32 v19.8b, v19.8b
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x11]
-; CHECK-BE-NEXT:    uaddw v3.2d, v7.2d, v21.2s
-; CHECK-BE-NEXT:    uaddw v4.2d, v18.2d, v4.2s
-; CHECK-BE-NEXT:    uaddw v5.2d, v5.2d, v16.2s
-; CHECK-BE-NEXT:    uaddw v7.2d, v20.2d, v17.2s
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x13]
-; CHECK-BE-NEXT:    uaddw v2.2d, v22.2d, v2.2s
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x14]
-; CHECK-BE-NEXT:    uaddw v3.2d, v24.2d, v6.2s
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x10]
-; CHECK-BE-NEXT:    uaddw v4.2d, v23.2d, v19.2s
+; CHECK-BE-NEXT:    uaddw v17.2d, v19.2d, v17.2s
+; CHECK-BE-NEXT:    ld1 { v19.2d }, [x12]
+; CHECK-BE-NEXT:    uaddw v5.2d, v21.2d, v5.2s
+; CHECK-BE-NEXT:    ld1 { v21.2d }, [x13]
+; CHECK-BE-NEXT:    uaddw v3.2d, v6.2d, v3.2s
+; CHECK-BE-NEXT:    ld1 { v6.2d }, [x17]
+; CHECK-BE-NEXT:    uaddw v4.2d, v16.2d, v4.2s
+; CHECK-BE-NEXT:    st1 { v7.2d }, [x15]
+; CHECK-BE-NEXT:    uaddw v7.2d, v19.2d, v18.2s
+; CHECK-BE-NEXT:    uaddw v16.2d, v21.2d, v20.2s
+; CHECK-BE-NEXT:    uaddw v2.2d, v6.2d, v2.2s
+; CHECK-BE-NEXT:    st1 { v17.2d }, [x14]
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x11]
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT:    st1 { v4.2d }, [x16]
 ; CHECK-BE-NEXT:    st1 { v7.2d }, [x12]
-; CHECK-BE-NEXT:    st1 { v2.2d }, [x15]
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x16]
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x17]
+; CHECK-BE-NEXT:    st1 { v16.2d }, [x13]
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x17]
 ; CHECK-BE-NEXT:    b.ne .LBB17_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -1822,44 +1823,44 @@ define void @zext_v16i8_to_v16i64_in_sequence_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x10, x0, x8
 ; CHECK-NEXT:    add x8, x8, #16
-; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    ldp q0, q1, [x10]
-; CHECK-NEXT:    ushll.8h v2, v0, #0
-; CHECK-NEXT:    ushll2.8h v0, v0, #0
-; CHECK-NEXT:    ushll.4s v4, v2, #0
+; CHECK-NEXT:    cmp x8, #128
+; CHECK-NEXT:    ushll2.8h v2, v0, #0
+; CHECK-NEXT:    ushll.8h v0, v0, #0
+; CHECK-NEXT:    ushll2.8h v6, v1, #0
+; CHECK-NEXT:    ushll.8h v1, v1, #0
+; CHECK-NEXT:    ushll2.4s v3, v2, #0
+; CHECK-NEXT:    ushll.4s v2, v2, #0
 ; CHECK-NEXT:    ushll2.4s v5, v0, #0
-; CHECK-NEXT:    ushll2.4s v2, v2, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ushll2.2d v6, v5, #0
+; CHECK-NEXT:    ushll2.2d v4, v3, #0
+; CHECK-NEXT:    ushll.2d v3, v3, #0
+; CHECK-NEXT:    ushll2.2d v7, v2, #0
+; CHECK-NEXT:    ushll.2d v2, v2, #0
+; CHECK-NEXT:    stp q3, q4, [x9, #-32]
+; CHECK-NEXT:    ushll2.2d v4, v5, #0
+; CHECK-NEXT:    ushll2.4s v3, v6, #0
 ; CHECK-NEXT:    ushll.2d v5, v5, #0
-; CHECK-NEXT:    ushll2.8h v3, v1, #0
+; CHECK-NEXT:    stp q2, q7, [x9, #-64]
 ; CHECK-NEXT:    ushll2.2d v7, v0, #0
-; CHECK-NEXT:    stp q5, q6, [x9, #-32]
 ; CHECK-NEXT:    ushll.2d v0, v0, #0
-; CHECK-NEXT:    ushll2.2d v5, v2, #0
-; CHECK-NEXT:    ushll.2d v2, v2, #0
-; CHECK-NEXT:    stp q0, q7, [x9, #-64]
-; CHECK-NEXT:    ushll2.2d v0, v4, #0
-; CHECK-NEXT:    stp q2, q5, [x9, #-96]
-; CHECK-NEXT:    ushll2.4s v5, v3, #0
-; CHECK-NEXT:    ushll.2d v2, v4, #0
-; CHECK-NEXT:    ushll2.2d v4, v5, #0
-; CHECK-NEXT:    stp q2, q0, [x9, #-128]
-; CHECK-NEXT:    ushll.2d v0, v5, #0
-; CHECK-NEXT:    ushll.4s v2, v3, #0
-; CHECK-NEXT:    stp q0, q4, [x9, #96]
-; CHECK-NEXT:    ushll.8h v0, v1, #0
-; CHECK-NEXT:    ushll2.2d v1, v2, #0
-; CHECK-NEXT:    ushll.2d v2, v2, #0
-; CHECK-NEXT:    ushll2.4s v3, v0, #0
-; CHECK-NEXT:    stp q2, q1, [x9, #64]
-; CHECK-NEXT:    ushll2.2d v1, v3, #0
-; CHECK-NEXT:    ushll.2d v2, v3, #0
-; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    stp q2, q1, [x9, #32]
-; CHECK-NEXT:    ushll2.2d v1, v0, #0
+; CHECK-NEXT:    ushll.4s v2, v6, #0
+; CHECK-NEXT:    stp q5, q4, [x9, #-96]
+; CHECK-NEXT:    ushll2.2d v4, v3, #0
+; CHECK-NEXT:    ushll2.4s v5, v1, #0
+; CHECK-NEXT:    ushll.2d v3, v3, #0
+; CHECK-NEXT:    stp q0, q7, [x9, #-128]
+; CHECK-NEXT:    ushll.4s v0, v1, #0
+; CHECK-NEXT:    ushll2.2d v6, v2, #0
+; CHECK-NEXT:    ushll.2d v1, v2, #0
+; CHECK-NEXT:    ushll2.2d v2, v5, #0
+; CHECK-NEXT:    stp q3, q4, [x9, #96]
+; CHECK-NEXT:    ushll.2d v3, v5, #0
+; CHECK-NEXT:    ushll2.2d v4, v0, #0
 ; CHECK-NEXT:    ushll.2d v0, v0, #0
-; CHECK-NEXT:    stp q0, q1, [x9], #128
+; CHECK-NEXT:    stp q1, q6, [x9, #64]
+; CHECK-NEXT:    stp q3, q2, [x9, #32]
+; CHECK-NEXT:    stp q0, q4, [x9], #128
 ; CHECK-NEXT:    b.ne LBB18_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
@@ -1871,70 +1872,70 @@ define void @zext_v16i8_to_v16i64_in_sequence_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB18_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x10, x0, x8
+; CHECK-BE-NEXT:    sub x11, x9, #32
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    add x11, x10, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x10]
+; CHECK-BE-NEXT:    add x10, x10, #16
+; CHECK-BE-NEXT:    cmp x8, #128
+; CHECK-BE-NEXT:    ld1 { v5.16b }, [x10]
 ; CHECK-BE-NEXT:    sub x10, x9, #16
-; CHECK-BE-NEXT:    ld1 { v3.16b }, [x11]
-; CHECK-BE-NEXT:    sub x11, x9, #32
 ; CHECK-BE-NEXT:    ushll2 v1.8h, v0.16b, #0
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v2.4s, v1.8h, #0
+; CHECK-BE-NEXT:    ushll2 v3.4s, v0.8h, #0
 ; CHECK-BE-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    ushll2 v4.2d, v2.4s, #0
 ; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT:    ushll2 v6.2d, v1.4s, #0
+; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
-; CHECK-BE-NEXT:    sub x10, x9, #48
-; CHECK-BE-NEXT:    ushll2 v4.2d, v1.4s, #0
+; CHECK-BE-NEXT:    ushll2 v4.2d, v3.4s, #0
+; CHECK-BE-NEXT:    ushll v3.2d, v3.2s, #0
 ; CHECK-BE-NEXT:    st1 { v2.2d }, [x11]
-; CHECK-BE-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-BE-NEXT:    ushll2 v2.8h, v5.16b, #0
 ; CHECK-BE-NEXT:    sub x11, x9, #80
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
-; CHECK-BE-NEXT:    sub x10, x9, #64
-; CHECK-BE-NEXT:    ushll2 v5.2d, v2.4s, #0
-; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x11]
+; CHECK-BE-NEXT:    sub x10, x9, #48
+; CHECK-BE-NEXT:    st1 { v4.2d }, [x11]
+; CHECK-BE-NEXT:    ushll v4.8h, v5.8b, #0
+; CHECK-BE-NEXT:    sub x11, x9, #64
+; CHECK-BE-NEXT:    ushll2 v5.4s, v2.8h, #0
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x11]
 ; CHECK-BE-NEXT:    sub x11, x9, #96
-; CHECK-BE-NEXT:    st1 { v1.2d }, [x10]
-; CHECK-BE-NEXT:    ushll v4.8h, v3.8b, #0
-; CHECK-BE-NEXT:    sub x10, x9, #112
-; CHECK-BE-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-BE-NEXT:    ushll v5.2d, v0.2s, #0
-; CHECK-BE-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-BE-NEXT:    ushll v1.4s, v4.4h, #0
+; CHECK-BE-NEXT:    ushll2 v1.2d, v0.4s, #0
+; CHECK-BE-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT:    st1 { v6.2d }, [x10]
+; CHECK-BE-NEXT:    sub x10, x9, #128
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x11]
+; CHECK-BE-NEXT:    ushll2 v3.4s, v4.8h, #0
+; CHECK-BE-NEXT:    ushll2 v6.2d, v5.4s, #0
+; CHECK-BE-NEXT:    sub x11, x9, #112
+; CHECK-BE-NEXT:    ushll v5.2d, v5.2s, #0
 ; CHECK-BE-NEXT:    st1 { v0.2d }, [x10]
-; CHECK-BE-NEXT:    ushll2 v6.4s, v3.8h, #0
-; CHECK-BE-NEXT:    st1 { v2.2d }, [x11]
-; CHECK-BE-NEXT:    sub x11, x9, #128
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x11]
+; CHECK-BE-NEXT:    ushll2 v1.2d, v2.4s, #0
 ; CHECK-BE-NEXT:    add x10, x9, #112
-; CHECK-BE-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-BE-NEXT:    ushll2 v0.2d, v1.4s, #0
-; CHECK-BE-NEXT:    ushll2 v1.2d, v6.4s, #0
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x11]
-; CHECK-BE-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-BE-NEXT:    add x11, x9, #96
-; CHECK-BE-NEXT:    st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT:    ushll v4.4s, v4.4h, #0
+; CHECK-BE-NEXT:    ushll2 v0.2d, v3.4s, #0
+; CHECK-BE-NEXT:    st1 { v6.2d }, [x10]
+; CHECK-BE-NEXT:    add x10, x9, #96
+; CHECK-BE-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT:    ushll v3.2d, v3.2s, #0
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x10]
 ; CHECK-BE-NEXT:    add x10, x9, #80
-; CHECK-BE-NEXT:    ushll v5.2d, v6.2s, #0
-; CHECK-BE-NEXT:    st1 { v2.2d }, [x9]
-; CHECK-BE-NEXT:    ushll2 v1.4s, v4.8h, #0
-; CHECK-BE-NEXT:    ushll2 v4.2d, v3.4s, #0
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x11]
-; CHECK-BE-NEXT:    add x11, x9, #48
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT:    add x10, x9, #48
+; CHECK-BE-NEXT:    ushll2 v1.2d, v4.4s, #0
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x10]
+; CHECK-BE-NEXT:    ushll v0.2d, v4.2s, #0
 ; CHECK-BE-NEXT:    add x10, x9, #64
-; CHECK-BE-NEXT:    ushll2 v5.2d, v1.4s, #0
-; CHECK-BE-NEXT:    ushll v3.2d, v3.2s, #0
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x11]
-; CHECK-BE-NEXT:    add x11, x9, #16
-; CHECK-BE-NEXT:    st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x10]
 ; CHECK-BE-NEXT:    add x10, x9, #32
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT:    add x10, x9, #16
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x9, #128
-; CHECK-BE-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-BE-NEXT:    st1 { v0.2d }, [x11]
 ; CHECK-BE-NEXT:    st1 { v1.2d }, [x10]
 ; CHECK-BE-NEXT:    b.ne .LBB18_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
@@ -1967,52 +1968,52 @@ exit:
 define void @zext_v16i8_to_v16i32_in_loop_scalable_vectors(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v16i8_to_v16i32_in_loop_scalable_vectors:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB19_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, x8]
+; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    ld1b { z1.s }, p0/z, [x9, #2, mul vl]
 ; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x9, #3, mul vl]
 ; CHECK-NEXT:    ld1b { z3.s }, p0/z, [x9, #1, mul vl]
-; CHECK-NEXT:    add z0.s, z0.s, z0.s
 ; CHECK-NEXT:    add x9, x1, x8, lsl #2
+; CHECK-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-NEXT:    add z1.s, z1.s, z1.s
+; CHECK-NEXT:    add z2.s, z2.s, z2.s
+; CHECK-NEXT:    add z3.s, z3.s, z3.s
 ; CHECK-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    add z1.s, z1.s, z1.s
-; CHECK-NEXT:    add z0.s, z3.s, z3.s
-; CHECK-NEXT:    add z2.s, z2.s, z2.s
 ; CHECK-NEXT:    st1w { z1.s }, p0, [x9, #2, mul vl]
 ; CHECK-NEXT:    st1w { z2.s }, p0, [x9, #3, mul vl]
-; CHECK-NEXT:    st1w { z0.s }, p0, [x9, #1, mul vl]
+; CHECK-NEXT:    st1w { z3.s }, p0, [x9, #1, mul vl]
 ; CHECK-NEXT:    b.ne LBB19_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop_scalable_vectors:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    mov x8, xzr
 ; CHECK-BE-NEXT:    ptrue p0.s
+; CHECK-BE-NEXT:    mov x8, xzr
 ; CHECK-BE-NEXT:  .LBB19_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    ld1b { z0.s }, p0/z, [x0, x8]
+; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    ld1b { z1.s }, p0/z, [x9, #2, mul vl]
 ; CHECK-BE-NEXT:    ld1b { z2.s }, p0/z, [x9, #3, mul vl]
 ; CHECK-BE-NEXT:    ld1b { z3.s }, p0/z, [x9, #1, mul vl]
-; CHECK-BE-NEXT:    add z0.s, z0.s, z0.s
 ; CHECK-BE-NEXT:    add x9, x1, x8, lsl #2
+; CHECK-BE-NEXT:    add z0.s, z0.s, z0.s
+; CHECK-BE-NEXT:    add z1.s, z1.s, z1.s
+; CHECK-BE-NEXT:    add z2.s, z2.s, z2.s
+; CHECK-BE-NEXT:    add z3.s, z3.s, z3.s
 ; CHECK-BE-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
 ; CHECK-BE-NEXT:    add x8, x8, #16
 ; CHECK-BE-NEXT:    cmp x8, #128
-; CHECK-BE-NEXT:    add z1.s, z1.s, z1.s
-; CHECK-BE-NEXT:    add z0.s, z3.s, z3.s
-; CHECK-BE-NEXT:    add z2.s, z2.s, z2.s
 ; CHECK-BE-NEXT:    st1w { z1.s }, p0, [x9, #2, mul vl]
 ; CHECK-BE-NEXT:    st1w { z2.s }, p0, [x9, #3, mul vl]
-; CHECK-BE-NEXT:    st1w { z0.s }, p0, [x9, #1, mul vl]
+; CHECK-BE-NEXT:    st1w { z3.s }, p0, [x9, #1, mul vl]
 ; CHECK-BE-NEXT:    b.ne .LBB19_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -2177,45 +2178,46 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v20i8_to_v20i24_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh22:
-; CHECK-NEXT:    adrp x9, lCPI20_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI20_0@PAGE
 ; CHECK-NEXT:  Lloh23:
-; CHECK-NEXT:    adrp x10, lCPI20_1@PAGE
+; CHECK-NEXT:    adrp x9, lCPI20_1@PAGE
 ; CHECK-NEXT:  Lloh24:
-; CHECK-NEXT:    adrp x11, lCPI20_2@PAGE
+; CHECK-NEXT:    adrp x10, lCPI20_2@PAGE
 ; CHECK-NEXT:  Lloh25:
-; CHECK-NEXT:    adrp x12, lCPI20_3@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    ldr q0, [x8, lCPI20_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh26:
-; CHECK-NEXT:    ldr q0, [x9, lCPI20_0@PAGEOFF]
+; CHECK-NEXT:    adrp x8, lCPI20_3@PAGE
 ; CHECK-NEXT:  Lloh27:
-; CHECK-NEXT:    ldr q1, [x10, lCPI20_1@PAGEOFF]
+; CHECK-NEXT:    ldr q1, [x9, lCPI20_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh28:
-; CHECK-NEXT:    ldr q2, [x11, lCPI20_2@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x10, lCPI20_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh29:
-; CHECK-NEXT:    ldr q3, [x12, lCPI20_3@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x8, lCPI20_3@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB20_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
 ; CHECK-NEXT:    add x8, x8, #16
-; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    ldp q4, q5, [x9]
+; CHECK-NEXT:    ldp q5, q4, [x9]
 ; CHECK-NEXT:    add x9, x1, #56
-; CHECK-NEXT:    tbl.16b v6, { v4 }, v2
-; CHECK-NEXT:    tbl.16b v7, { v4 }, v1
-; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
-; CHECK-NEXT:    tbl.16b v5, { v5 }, v3
+; CHECK-NEXT:    cmp x8, #128
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v6, { v5 }, v2
+; CHECK-NEXT:    tbl.16b v7, { v5 }, v1
+; CHECK-NEXT:    tbl.16b v5, { v5 }, v0
 ; CHECK-NEXT:    stp q7, q6, [x1, #16]
-; CHECK-NEXT:    str q4, [x1]
-; CHECK-NEXT:    str d5, [x1, #48]
+; CHECK-NEXT:    str q5, [x1]
+; CHECK-NEXT:    str d4, [x1, #48]
 ; CHECK-NEXT:    add x1, x1, #64
-; CHECK-NEXT:    st1.s { v5 }[2], [x9]
+; CHECK-NEXT:    st1.s { v4 }[2], [x9]
 ; CHECK-NEXT:    b.ne LBB20_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh25, Lloh29
+; CHECK-NEXT:    .loh AdrpLdr Lloh26, Lloh29
 ; CHECK-NEXT:    .loh AdrpLdr Lloh24, Lloh28
 ; CHECK-NEXT:    .loh AdrpLdr Lloh23, Lloh27
-; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh26
+; CHECK-NEXT:    .loh AdrpAdrp Lloh22, Lloh26
+; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh25
 ;
 ; CHECK-BE-LABEL: zext_v20i8_to_v20i24_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -2237,24 +2239,24 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x8, x8, #16
 ; CHECK-BE-NEXT:    add x10, x9, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v5.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #32
 ; CHECK-BE-NEXT:    ld1 { v4.16b }, [x10]
-; CHECK-BE-NEXT:    add x10, x1, #56
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    tbl v6.16b, { v5.16b }, v3.16b
-; CHECK-BE-NEXT:    tbl v7.16b, { v5.16b }, v1.16b
+; CHECK-BE-NEXT:    tbl v7.16b, { v5.16b }, v2.16b
+; CHECK-BE-NEXT:    tbl v5.16b, { v5.16b }, v1.16b
 ; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
-; CHECK-BE-NEXT:    tbl v5.16b, { v5.16b }, v2.16b
 ; CHECK-BE-NEXT:    st1 { v6.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x1, #16
-; CHECK-BE-NEXT:    st1 { v7.16b }, [x1]
-; CHECK-BE-NEXT:    rev64 v16.16b, v4.16b
-; CHECK-BE-NEXT:    rev32 v4.16b, v4.16b
-; CHECK-BE-NEXT:    st1 { v5.16b }, [x9]
-; CHECK-BE-NEXT:    str d16, [x1, #48]
+; CHECK-BE-NEXT:    rev32 v16.16b, v4.16b
+; CHECK-BE-NEXT:    rev64 v4.16b, v4.16b
+; CHECK-BE-NEXT:    st1 { v7.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #56
+; CHECK-BE-NEXT:    st1 { v5.16b }, [x1]
+; CHECK-BE-NEXT:    str d4, [x1, #48]
 ; CHECK-BE-NEXT:    add x1, x1, #64
-; CHECK-BE-NEXT:    st1 { v4.s }[2], [x10]
+; CHECK-BE-NEXT:    st1 { v16.s }[2], [x9]
 ; CHECK-BE-NEXT:    b.ne .LBB20_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -2504,64 +2506,64 @@ define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v23i8_to_v23i48_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh30:
-; CHECK-NEXT:    adrp x9, lCPI21_0@PAGE
+; CHECK-NEXT:    adrp x8, lCPI21_0@PAGE
 ; CHECK-NEXT:  Lloh31:
-; CHECK-NEXT:    adrp x10, lCPI21_1@PAGE
+; CHECK-NEXT:    adrp x9, lCPI21_1@PAGE
 ; CHECK-NEXT:  Lloh32:
-; CHECK-NEXT:    adrp x11, lCPI21_2@PAGE
-; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    adrp x10, lCPI21_2@PAGE
 ; CHECK-NEXT:  Lloh33:
-; CHECK-NEXT:    ldr q0, [x9, lCPI21_0@PAGEOFF]
+; CHECK-NEXT:    ldr q0, [x8, lCPI21_0@PAGEOFF]
 ; CHECK-NEXT:  Lloh34:
-; CHECK-NEXT:    adrp x9, lCPI21_3@PAGE
+; CHECK-NEXT:    ldr q1, [x9, lCPI21_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh35:
-; CHECK-NEXT:    ldr q1, [x10, lCPI21_1@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x10, lCPI21_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh36:
-; CHECK-NEXT:    adrp x10, lCPI21_4@PAGE
+; CHECK-NEXT:    adrp x8, lCPI21_3@PAGE
 ; CHECK-NEXT:  Lloh37:
-; CHECK-NEXT:    ldr q2, [x11, lCPI21_2@PAGEOFF]
+; CHECK-NEXT:    adrp x9, lCPI21_4@PAGE
 ; CHECK-NEXT:  Lloh38:
-; CHECK-NEXT:    adrp x11, lCPI21_5@PAGE
+; CHECK-NEXT:    adrp x10, lCPI21_5@PAGE
 ; CHECK-NEXT:  Lloh39:
-; CHECK-NEXT:    ldr q3, [x9, lCPI21_3@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x8, lCPI21_3@PAGEOFF]
 ; CHECK-NEXT:  Lloh40:
-; CHECK-NEXT:    ldr q4, [x10, lCPI21_4@PAGEOFF]
+; CHECK-NEXT:    ldr q4, [x9, lCPI21_4@PAGEOFF]
 ; CHECK-NEXT:  Lloh41:
-; CHECK-NEXT:    ldr q5, [x11, lCPI21_5@PAGEOFF]
+; CHECK-NEXT:    ldr q5, [x10, lCPI21_5@PAGEOFF]
+; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:  LBB21_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x9, x0, x8
+; CHECK-NEXT:    movi.2d v19, #0000000000000000
 ; CHECK-NEXT:    add x8, x8, #16
-; CHECK-NEXT:    movi.2d v6, #0000000000000000
+; CHECK-NEXT:    ldp q7, q6, [x9]
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    ldp q16, q7, [x9]
 ; CHECK-NEXT:    strh wzr, [x1, #136]
-; CHECK-NEXT:    tbl.16b v18, { v16 }, v5
-; CHECK-NEXT:    tbl.16b v19, { v16 }, v4
-; CHECK-NEXT:    mov.b v6[4], v7[6]
+; CHECK-NEXT:    tbl.16b v16, { v6 }, v1
+; CHECK-NEXT:    tbl.16b v17, { v6 }, v0
+; CHECK-NEXT:    mov.b v19[4], v6[6]
+; CHECK-NEXT:    tbl.16b v18, { v7 }, v5
+; CHECK-NEXT:    tbl.16b v20, { v7 }, v4
+; CHECK-NEXT:    tbl.16b v21, { v7 }, v3
+; CHECK-NEXT:    stp q17, q16, [x1, #96]
+; CHECK-NEXT:    tbl.16b v16, { v7 }, v2
 ; CHECK-NEXT:    tbl.16b v17, { v7 }, v1
 ; CHECK-NEXT:    tbl.16b v7, { v7 }, v0
-; CHECK-NEXT:    tbl.16b v20, { v16 }, v3
-; CHECK-NEXT:    stp q19, q18, [x1, #64]
-; CHECK-NEXT:    fmov x9, d6
-; CHECK-NEXT:    stp q7, q17, [x1, #96]
-; CHECK-NEXT:    tbl.16b v17, { v16 }, v2
-; CHECK-NEXT:    tbl.16b v7, { v16 }, v1
-; CHECK-NEXT:    tbl.16b v16, { v16 }, v0
-; CHECK-NEXT:    stp q17, q20, [x1, #32]
-; CHECK-NEXT:    stp q16, q7, [x1]
+; CHECK-NEXT:    fmov x9, d19
+; CHECK-NEXT:    stp q20, q18, [x1, #64]
+; CHECK-NEXT:    stp q16, q21, [x1, #32]
+; CHECK-NEXT:    stp q7, q17, [x1]
 ; CHECK-NEXT:    str x9, [x1, #128]!
 ; CHECK-NEXT:    b.ne LBB21_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:    .loh AdrpLdr Lloh38, Lloh41
-; CHECK-NEXT:    .loh AdrpLdr Lloh36, Lloh40
-; CHECK-NEXT:    .loh AdrpLdr Lloh34, Lloh39
+; CHECK-NEXT:    .loh AdrpLdr Lloh37, Lloh40
+; CHECK-NEXT:    .loh AdrpLdr Lloh36, Lloh39
 ; CHECK-NEXT:    .loh AdrpAdrp Lloh32, Lloh38
-; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh37
-; CHECK-NEXT:    .loh AdrpAdrp Lloh31, Lloh36
-; CHECK-NEXT:    .loh AdrpLdr Lloh31, Lloh35
-; CHECK-NEXT:    .loh AdrpAdrp Lloh30, Lloh34
+; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh35
+; CHECK-NEXT:    .loh AdrpAdrp Lloh31, Lloh37
+; CHECK-NEXT:    .loh AdrpLdr Lloh31, Lloh34
+; CHECK-NEXT:    .loh AdrpAdrp Lloh30, Lloh36
 ; CHECK-NEXT:    .loh AdrpLdr Lloh30, Lloh33
 ;
 ; CHECK-BE-LABEL: zext_v23i8_to_v23i48_in_loop:
@@ -2591,44 +2593,45 @@ define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:  .LBB21_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
-; CHECK-BE-NEXT:    add x11, x1, #64
-; CHECK-BE-NEXT:    add x10, x1, #80
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v7.16b }, [x9]
 ; CHECK-BE-NEXT:    add x9, x9, #16
-; CHECK-BE-NEXT:    ld1 { v18.16b }, [x9]
-; CHECK-BE-NEXT:    add x9, x1, #48
-; CHECK-BE-NEXT:    tbl v17.16b, { v7.16b }, v5.16b
+; CHECK-BE-NEXT:    cmp x8, #128
+; CHECK-BE-NEXT:    ld1 { v17.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #80
 ; CHECK-BE-NEXT:    tbl v16.16b, { v7.16b }, v6.16b
-; CHECK-BE-NEXT:    tbl v19.16b, { v7.16b }, v3.16b
-; CHECK-BE-NEXT:    tbl v20.16b, { v18.16b }, v0.16b
-; CHECK-BE-NEXT:    st1 { v17.16b }, [x11]
-; CHECK-BE-NEXT:    add x11, x1, #16
-; CHECK-BE-NEXT:    tbl v17.16b, { v7.16b }, v4.16b
-; CHECK-BE-NEXT:    st1 { v16.16b }, [x10]
-; CHECK-BE-NEXT:    add x10, x1, #32
-; CHECK-BE-NEXT:    tbl v16.16b, { v7.16b }, v1.16b
-; CHECK-BE-NEXT:    tbl v7.16b, { v7.16b }, v2.16b
-; CHECK-BE-NEXT:    tbl v21.16b, { v18.16b }, v1.16b
-; CHECK-BE-NEXT:    st1 { v17.16b }, [x9]
-; CHECK-BE-NEXT:    tbl v17.16b, { v18.16b }, v2.16b
+; CHECK-BE-NEXT:    tbl v18.16b, { v7.16b }, v5.16b
+; CHECK-BE-NEXT:    tbl v19.16b, { v7.16b }, v4.16b
+; CHECK-BE-NEXT:    tbl v20.16b, { v7.16b }, v3.16b
+; CHECK-BE-NEXT:    tbl v21.16b, { v17.16b }, v0.16b
+; CHECK-BE-NEXT:    st1 { v16.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #64
+; CHECK-BE-NEXT:    tbl v16.16b, { v7.16b }, v2.16b
+; CHECK-BE-NEXT:    st1 { v18.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #48
+; CHECK-BE-NEXT:    tbl v18.16b, { v17.16b }, v2.16b
+; CHECK-BE-NEXT:    st1 { v19.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #32
+; CHECK-BE-NEXT:    tbl v17.16b, { v17.16b }, v1.16b
+; CHECK-BE-NEXT:    st1 { v20.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #16
+; CHECK-BE-NEXT:    rev64 v19.16b, v21.16b
+; CHECK-BE-NEXT:    st1 { v16.16b }, [x9]
+; CHECK-BE-NEXT:    rev16 v16.16b, v21.16b
 ; CHECK-BE-NEXT:    add x9, x1, #112
-; CHECK-BE-NEXT:    rev64 v18.16b, v20.16b
-; CHECK-BE-NEXT:    st1 { v19.16b }, [x10]
-; CHECK-BE-NEXT:    rev16 v19.16b, v20.16b
-; CHECK-BE-NEXT:    add x10, x1, #96
-; CHECK-BE-NEXT:    st1 { v7.16b }, [x11]
-; CHECK-BE-NEXT:    add x11, x1, #136
+; CHECK-BE-NEXT:    st1 { v18.16b }, [x9]
+; CHECK-BE-NEXT:    add x9, x1, #96
+; CHECK-BE-NEXT:    tbl v7.16b, { v7.16b }, v1.16b
 ; CHECK-BE-NEXT:    st1 { v17.16b }, [x9]
-; CHECK-BE-NEXT:    fmov x9, d18
-; CHECK-BE-NEXT:    st1 { v21.16b }, [x10]
-; CHECK-BE-NEXT:    st1 { v19.h }[4], [x11]
-; CHECK-BE-NEXT:    st1 { v16.16b }, [x1]
+; CHECK-BE-NEXT:    add x9, x1, #136
+; CHECK-BE-NEXT:    st1 { v16.h }[4], [x9]
+; CHECK-BE-NEXT:    fmov x9, d19
+; CHECK-BE-NEXT:    st1 { v7.16b }, [x1]
 ; CHECK-BE-NEXT:    str x9, [x1, #128]!
 ; CHECK-BE-NEXT:    b.ne .LBB21_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
 
 
 
@@ -2658,32 +2661,32 @@ define void @zext_v8i8_to_v8i33_in_loop(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d0, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
-; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    strb wzr, [x1, #32]
+; CHECK-NEXT:    cmp x8, #128
 ; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    ushll.4s v1, v0, #0
-; CHECK-NEXT:    ushll2.4s v0, v0, #0
-; CHECK-NEXT:    ushll.2d v2, v1, #0
+; CHECK-NEXT:    ushll2.4s v1, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ushll2.2d v2, v1, #0
+; CHECK-NEXT:    ushll.2d v1, v1, #0
 ; CHECK-NEXT:    ushll2.2d v3, v0, #0
-; CHECK-NEXT:    ushll2.2d v1, v1, #0
-; CHECK-NEXT:    mov.d x9, v3[1]
-; CHECK-NEXT:    fmov x10, d3
-; CHECK-NEXT:    mov.d x12, v1[1]
 ; CHECK-NEXT:    ushll.2d v0, v0, #0
-; CHECK-NEXT:    lsl x9, x9, #39
-; CHECK-NEXT:    orr x9, x9, x10, lsl #6
-; CHECK-NEXT:    fmov x10, d1
-; CHECK-NEXT:    mov.d x11, v0[1]
-; CHECK-NEXT:    lsl x12, x12, #35
-; CHECK-NEXT:    mov.d x14, v2[1]
-; CHECK-NEXT:    fmov x13, d0
-; CHECK-NEXT:    orr x10, x12, x10, lsl #2
+; CHECK-NEXT:    mov.d x9, v2[1]
+; CHECK-NEXT:    mov.d x10, v1[1]
 ; CHECK-NEXT:    fmov x12, d2
-; CHECK-NEXT:    lsl x11, x11, #37
-; CHECK-NEXT:    orr x11, x11, x13, lsl #4
-; CHECK-NEXT:    orr x12, x12, x14, lsl #33
-; CHECK-NEXT:    stp x11, x9, [x1, #16]
-; CHECK-NEXT:    stp x12, x10, [x1], #128
+; CHECK-NEXT:    mov.d x11, v3[1]
+; CHECK-NEXT:    mov.d x13, v0[1]
+; CHECK-NEXT:    lsl x9, x9, #39
+; CHECK-NEXT:    lsl x10, x10, #37
+; CHECK-NEXT:    lsl x11, x11, #35
+; CHECK-NEXT:    orr x9, x9, x12, lsl #6
+; CHECK-NEXT:    fmov x12, d1
+; CHECK-NEXT:    orr x10, x10, x12, lsl #4
+; CHECK-NEXT:    fmov x12, d3
+; CHECK-NEXT:    stp x10, x9, [x1, #16]
+; CHECK-NEXT:    orr x11, x11, x12, lsl #2
+; CHECK-NEXT:    fmov x12, d0
+; CHECK-NEXT:    orr x9, x12, x13, lsl #33
+; CHECK-NEXT:    stp x9, x11, [x1], #128
 ; CHECK-NEXT:    b.ne LBB22_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
@@ -2695,37 +2698,37 @@ define void @zext_v8i8_to_v8i33_in_loop(ptr %src, ptr %dst) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    add x9, x0, x8
 ; CHECK-BE-NEXT:    add x8, x8, #16
-; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT:    cmp x8, #128
 ; CHECK-BE-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-BE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-BE-NEXT:    ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-BE-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-BE-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-BE-NEXT:    ushll2 v3.2d, v0.4s, #0
 ; CHECK-BE-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-BE-NEXT:    mov x9, v3.d[1]
-; CHECK-BE-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-BE-NEXT:    fmov x10, d3
-; CHECK-BE-NEXT:    mov x11, v0.d[1]
+; CHECK-BE-NEXT:    mov x9, v2.d[1]
+; CHECK-BE-NEXT:    mov x10, v1.d[1]
+; CHECK-BE-NEXT:    fmov x13, d1
+; CHECK-BE-NEXT:    mov x11, v3.d[1]
+; CHECK-BE-NEXT:    mov x12, v0.d[1]
+; CHECK-BE-NEXT:    fmov x14, d2
+; CHECK-BE-NEXT:    fmov x15, d3
+; CHECK-BE-NEXT:    lsl x9, x9, #2
+; CHECK-BE-NEXT:    orr x13, x10, x13, lsl #33
+; CHECK-BE-NEXT:    strb w10, [x1, #32]
+; CHECK-BE-NEXT:    lsl x11, x11, #4
+; CHECK-BE-NEXT:    lsl x12, x12, #6
+; CHECK-BE-NEXT:    orr x14, x9, x14, lsl #35
+; CHECK-BE-NEXT:    extr x9, x9, x13, #8
 ; CHECK-BE-NEXT:    fmov x13, d0
-; CHECK-BE-NEXT:    mov x12, v1.d[1]
-; CHECK-BE-NEXT:    strb w9, [x1, #32]
-; CHECK-BE-NEXT:    orr x10, x9, x10, lsl #33
-; CHECK-BE-NEXT:    fmov x15, d1
-; CHECK-BE-NEXT:    mov x14, v2.d[1]
-; CHECK-BE-NEXT:    lsl x11, x11, #2
-; CHECK-BE-NEXT:    lsl x12, x12, #4
-; CHECK-BE-NEXT:    orr x13, x11, x13, lsl #35
-; CHECK-BE-NEXT:    extr x10, x11, x10, #8
-; CHECK-BE-NEXT:    fmov x11, d2
-; CHECK-BE-NEXT:    orr x15, x12, x15, lsl #37
-; CHECK-BE-NEXT:    lsl x14, x14, #6
-; CHECK-BE-NEXT:    extr x9, x12, x13, #8
-; CHECK-BE-NEXT:    orr x11, x14, x11, lsl #39
-; CHECK-BE-NEXT:    extr x12, x14, x15, #8
-; CHECK-BE-NEXT:    lsr x11, x11, #8
-; CHECK-BE-NEXT:    stp x9, x10, [x1, #16]
-; CHECK-BE-NEXT:    stp x11, x12, [x1], #128
+; CHECK-BE-NEXT:    orr x15, x11, x15, lsl #37
+; CHECK-BE-NEXT:    extr x10, x11, x14, #8
+; CHECK-BE-NEXT:    orr x11, x12, x13, lsl #39
+; CHECK-BE-NEXT:    extr x12, x12, x15, #8
+; CHECK-BE-NEXT:    stp x10, x9, [x1, #16]
+; CHECK-BE-NEXT:    lsr x9, x11, #8
+; CHECK-BE-NEXT:    stp x9, x12, [x1], #128
 ; CHECK-BE-NEXT:    b.ne .LBB22_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    ret
@@ -2753,35 +2756,35 @@ exit:
 define i32 @test_pr62620_widening_instr(ptr %p1, ptr %p2, i64 %lx, i32 %h) {
 ; CHECK-LABEL: test_pr62620_widening_instr:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    lsl x9, x2, #4
+; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:  LBB23_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q0, [x8, x9]
-; CHECK-NEXT:    subs w3, w3, #1
+; CHECK-NEXT:    ldr q0, [x0, x9]
 ; CHECK-NEXT:    ldr q1, [x1, x9]
+; CHECK-NEXT:    subs w3, w3, #1
 ; CHECK-NEXT:    uabdl.8h v2, v0, v1
 ; CHECK-NEXT:    uabal2.8h v2, v0, v1
 ; CHECK-NEXT:    uaddlv.8h s0, v2
 ; CHECK-NEXT:    fmov w10, s0
-; CHECK-NEXT:    add w0, w10, w0
+; CHECK-NEXT:    add w8, w10, w8
 ; CHECK-NEXT:    b.ne LBB23_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-BE-LABEL: test_pr62620_widening_instr:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    mov x8, x0
 ; CHECK-BE-NEXT:    lsl x9, x2, #4
+; CHECK-BE-NEXT:    mov x8, x0
 ; CHECK-BE-NEXT:    mov w0, wzr
 ; CHECK-BE-NEXT:    add x8, x8, x9
 ; CHECK-BE-NEXT:    add x9, x1, x9
 ; CHECK-BE-NEXT:  .LBB23_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x8]
-; CHECK-BE-NEXT:    subs w3, w3, #1
 ; CHECK-BE-NEXT:    ld1 { v1.16b }, [x9]
+; CHECK-BE-NEXT:    subs w3, w3, #1
 ; CHECK-BE-NEXT:    uabdl v2.8h, v0.8b, v1.8b
 ; CHECK-BE-NEXT:    uabal2 v2.8h, v0.16b, v1.16b
 ; CHECK-BE-NEXT:    uaddlv s0, v2.8h
@@ -2826,19 +2829,19 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-NEXT:  LBB24_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q0, [x1], #16
-; CHECK-NEXT:    ldr q1, [x0]
-; CHECK-NEXT:    subs w2, w2, #1
+; CHECK-NEXT:    ldr q3, [x0]
 ; CHECK-NEXT:    ldr q2, [x8, #16]!
-; CHECK-NEXT:    ushll2.8h v3, v0, #0
+; CHECK-NEXT:    subs w2, w2, #1
+; CHECK-NEXT:    ushll2.8h v1, v0, #0
 ; CHECK-NEXT:    ushll.8h v0, v0, #0
-; CHECK-NEXT:    umull2.4s v4, v2, v3
-; CHECK-NEXT:    umull.4s v2, v2, v3
-; CHECK-NEXT:    umull.4s v3, v1, v0
-; CHECK-NEXT:    umull2.4s v0, v1, v0
-; CHECK-NEXT:    stp q2, q4, [x0, #32]
-; CHECK-NEXT:    str q3, [x0]
+; CHECK-NEXT:    umull2.4s v4, v2, v1
+; CHECK-NEXT:    umull.4s v1, v2, v1
+; CHECK-NEXT:    umull2.4s v2, v3, v0
+; CHECK-NEXT:    umull.4s v0, v3, v0
+; CHECK-NEXT:    stp q1, q4, [x0, #32]
+; CHECK-NEXT:    str q0, [x0]
 ; CHECK-NEXT:    mov x0, x8
-; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    str q2, [x8]
 ; CHECK-NEXT:    b.ne LBB24_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    mov w0, wzr
@@ -2849,24 +2852,24 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:  .LBB24_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v0.16b }, [x1]
+; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
 ; CHECK-BE-NEXT:    add x8, x0, #16
+; CHECK-BE-NEXT:    ld1 { v3.8h }, [x8]
 ; CHECK-BE-NEXT:    add x9, x0, #48
 ; CHECK-BE-NEXT:    add x10, x0, #32
-; CHECK-BE-NEXT:    ld1 { v1.8h }, [x0]
 ; CHECK-BE-NEXT:    subs w2, w2, #1
 ; CHECK-BE-NEXT:    add x1, x1, #16
-; CHECK-BE-NEXT:    ld1 { v4.8h }, [x8]
 ; CHECK-BE-NEXT:    ushll v2.8h, v0.8b, #0
 ; CHECK-BE-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-BE-NEXT:    umull v3.4s, v1.4h, v2.4h
+; CHECK-BE-NEXT:    umull v4.4s, v1.4h, v2.4h
+; CHECK-BE-NEXT:    umull2 v5.4s, v3.8h, v0.8h
+; CHECK-BE-NEXT:    umull v0.4s, v3.4h, v0.4h
 ; CHECK-BE-NEXT:    umull2 v1.4s, v1.8h, v2.8h
-; CHECK-BE-NEXT:    umull2 v2.4s, v4.8h, v0.8h
-; CHECK-BE-NEXT:    umull v0.4s, v4.4h, v0.4h
-; CHECK-BE-NEXT:    st1 { v3.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v4.4s }, [x0]
 ; CHECK-BE-NEXT:    mov x0, x8
-; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
-; CHECK-BE-NEXT:    st1 { v2.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v5.4s }, [x9]
 ; CHECK-BE-NEXT:    st1 { v0.4s }, [x10]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x8]
 ; CHECK-BE-NEXT:    b.ne .LBB24_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    mov w0, wzr
@@ -2900,51 +2903,52 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-NEXT:  Lloh43:
 ; CHECK-NEXT:    adrp x9, lCPI25_1@PAGE
 ; CHECK-NEXT:  Lloh44:
-; CHECK-NEXT:    adrp x10, lCPI25_2@PAGE
+; CHECK-NEXT:    adrp x10, lCPI25_3@PAGE
 ; CHECK-NEXT:  Lloh45:
-; CHECK-NEXT:    adrp x11, lCPI25_3@PAGE
-; CHECK-NEXT:  Lloh46:
 ; CHECK-NEXT:    ldr q0, [x8, lCPI25_0@PAGEOFF]
-; CHECK-NEXT:    mov x8, x1
+; CHECK-NEXT:  Lloh46:
+; CHECK-NEXT:    adrp x8, lCPI25_2@PAGE
 ; CHECK-NEXT:  Lloh47:
 ; CHECK-NEXT:    ldr q1, [x9, lCPI25_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh48:
-; CHECK-NEXT:    ldr q2, [x10, lCPI25_2@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x8, lCPI25_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh49:
-; CHECK-NEXT:    ldr q3, [x11, lCPI25_3@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x10, lCPI25_3@PAGEOFF]
+; CHECK-NEXT:    mov x8, x1
 ; CHECK-NEXT:  LBB25_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q4, [x0]
+; CHECK-NEXT:    ldp q16, q7, [x1, #32]
+; CHECK-NEXT:    ldr q18, [x8, #16]!
 ; CHECK-NEXT:    subs w2, w2, #1
-; CHECK-NEXT:    ldp q7, q17, [x1, #32]
-; CHECK-NEXT:    tbl.16b v16, { v4 }, v3
-; CHECK-NEXT:    tbl.16b v18, { v4 }, v0
-; CHECK-NEXT:    tbl.16b v19, { v4 }, v1
-; CHECK-NEXT:    tbl.16b v4, { v4 }, v2
-; CHECK-NEXT:    ldr q5, [x1]
-; CHECK-NEXT:    ldr q6, [x8, #16]!
-; CHECK-NEXT:    umull2.2d v20, v16, v17
+; CHECK-NEXT:    tbl.16b v5, { v4 }, v3
+; CHECK-NEXT:    tbl.16b v6, { v4 }, v0
+; CHECK-NEXT:    tbl.16b v17, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v1
+; CHECK-NEXT:    umull2.2d v19, v5, v7
+; CHECK-NEXT:    umull.2d v5, v5, v7
+; CHECK-NEXT:    ldr q7, [x1]
+; CHECK-NEXT:    umull2.2d v20, v6, v16
+; CHECK-NEXT:    umull2.2d v21, v17, v18
+; CHECK-NEXT:    umull.2d v17, v17, v18
+; CHECK-NEXT:    umull2.2d v18, v4, v7
+; CHECK-NEXT:    umull.2d v4, v4, v7
 ; CHECK-NEXT:    mov x1, x8
-; CHECK-NEXT:    umull2.2d v21, v18, v7
-; CHECK-NEXT:    umull.2d v16, v16, v17
-; CHECK-NEXT:    umull2.2d v17, v4, v6
-; CHECK-NEXT:    umull.2d v4, v4, v6
-; CHECK-NEXT:    umull2.2d v6, v19, v5
-; CHECK-NEXT:    str q21, [x0, #80]
-; CHECK-NEXT:    umull.2d v5, v19, v5
-; CHECK-NEXT:    stp q16, q20, [x0, #96]
-; CHECK-NEXT:    umull.2d v7, v18, v7
-; CHECK-NEXT:    stp q4, q17, [x0, #32]
-; CHECK-NEXT:    stp q5, q6, [x0]
-; CHECK-NEXT:    str q7, [x0, #64]!
+; CHECK-NEXT:    stp q5, q19, [x0, #96]
+; CHECK-NEXT:    umull.2d v5, v6, v16
+; CHECK-NEXT:    str q20, [x0, #80]
+; CHECK-NEXT:    stp q4, q18, [x0]
+; CHECK-NEXT:    stp q17, q21, [x0, #32]
+; CHECK-NEXT:    str q5, [x0, #64]!
 ; CHECK-NEXT:    b.ne LBB25_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh45, Lloh49
-; CHECK-NEXT:    .loh AdrpLdr Lloh44, Lloh48
+; CHECK-NEXT:    .loh AdrpLdr Lloh46, Lloh48
+; CHECK-NEXT:    .loh AdrpLdr Lloh44, Lloh49
 ; CHECK-NEXT:    .loh AdrpLdr Lloh43, Lloh47
-; CHECK-NEXT:    .loh AdrpLdr Lloh42, Lloh46
+; CHECK-NEXT:    .loh AdrpAdrp Lloh42, Lloh46
+; CHECK-NEXT:    .loh AdrpLdr Lloh42, Lloh45
 ;
 ; CHECK-BE-LABEL: test_widening_instr_mull_64:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -2963,57 +2967,58 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:  .LBB25_1: // %loop
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v4.16b }, [x0]
-; CHECK-BE-NEXT:    add x8, x1, #48
-; CHECK-BE-NEXT:    add x9, x1, #32
-; CHECK-BE-NEXT:    subs w2, w2, #1
-; CHECK-BE-NEXT:    ld1 { v5.4s }, [x1]
+; CHECK-BE-NEXT:    add x9, x1, #48
+; CHECK-BE-NEXT:    add x8, x1, #32
+; CHECK-BE-NEXT:    ld1 { v18.4s }, [x9]
+; CHECK-BE-NEXT:    ld1 { v16.4s }, [x1]
 ; CHECK-BE-NEXT:    add x1, x1, #16
-; CHECK-BE-NEXT:    ld1 { v6.4s }, [x8]
+; CHECK-BE-NEXT:    ld1 { v20.4s }, [x8]
+; CHECK-BE-NEXT:    ld1 { v22.4s }, [x1]
 ; CHECK-BE-NEXT:    add x8, x0, #96
-; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v3.16b
-; CHECK-BE-NEXT:    tbl v17.16b, { v4.16b }, v1.16b
-; CHECK-BE-NEXT:    ld1 { v20.4s }, [x1]
-; CHECK-BE-NEXT:    tbl v16.16b, { v4.16b }, v0.16b
-; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v2.16b
-; CHECK-BE-NEXT:    ld1 { v18.4s }, [x9]
-; CHECK-BE-NEXT:    rev32 v19.8b, v7.8b
+; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v3.16b
+; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v2.16b
+; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v1.16b
+; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
+; CHECK-BE-NEXT:    ext v24.16b, v18.16b, v18.16b, #8
 ; CHECK-BE-NEXT:    add x9, x0, #32
-; CHECK-BE-NEXT:    ext v23.16b, v6.16b, v6.16b, #8
-; CHECK-BE-NEXT:    rev32 v22.8b, v17.8b
+; CHECK-BE-NEXT:    ext v25.16b, v20.16b, v20.16b, #8
+; CHECK-BE-NEXT:    add x10, x0, #16
+; CHECK-BE-NEXT:    subs w2, w2, #1
+; CHECK-BE-NEXT:    ext v17.16b, v5.16b, v5.16b, #8
+; CHECK-BE-NEXT:    ext v19.16b, v6.16b, v6.16b, #8
+; CHECK-BE-NEXT:    rev32 v5.8b, v5.8b
+; CHECK-BE-NEXT:    rev32 v21.8b, v7.8b
+; CHECK-BE-NEXT:    rev32 v23.8b, v4.8b
 ; CHECK-BE-NEXT:    ext v7.16b, v7.16b, v7.16b, #8
-; CHECK-BE-NEXT:    ext v24.16b, v4.16b, v4.16b, #8
-; CHECK-BE-NEXT:    umull v6.2d, v19.2s, v6.2s
-; CHECK-BE-NEXT:    umull v19.2d, v22.2s, v20.2s
-; CHECK-BE-NEXT:    ext v22.16b, v18.16b, v18.16b, #8
-; CHECK-BE-NEXT:    ext v21.16b, v5.16b, v5.16b, #8
-; CHECK-BE-NEXT:    st1 { v6.2d }, [x8]
-; CHECK-BE-NEXT:    rev32 v6.8b, v7.8b
-; CHECK-BE-NEXT:    ext v7.16b, v17.16b, v17.16b, #8
-; CHECK-BE-NEXT:    rev32 v17.8b, v16.8b
+; CHECK-BE-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
+; CHECK-BE-NEXT:    rev32 v6.8b, v6.8b
+; CHECK-BE-NEXT:    rev32 v17.8b, v17.8b
+; CHECK-BE-NEXT:    rev32 v19.8b, v19.8b
+; CHECK-BE-NEXT:    umull v5.2d, v5.2s, v18.2s
+; CHECK-BE-NEXT:    umull v18.2d, v21.2s, v22.2s
+; CHECK-BE-NEXT:    ext v21.16b, v22.16b, v22.16b, #8
+; CHECK-BE-NEXT:    rev32 v7.8b, v7.8b
+; CHECK-BE-NEXT:    umull v22.2d, v23.2s, v16.2s
 ; CHECK-BE-NEXT:    ext v16.16b, v16.16b, v16.16b, #8
-; CHECK-BE-NEXT:    add x8, x0, #112
-; CHECK-BE-NEXT:    st1 { v19.2d }, [x9]
-; CHECK-BE-NEXT:    rev32 v19.8b, v24.8b
-; CHECK-BE-NEXT:    umull v6.2d, v6.2s, v23.2s
 ; CHECK-BE-NEXT:    rev32 v4.8b, v4.8b
-; CHECK-BE-NEXT:    umull v5.2d, v17.2s, v5.2s
+; CHECK-BE-NEXT:    umull v17.2d, v17.2s, v24.2s
+; CHECK-BE-NEXT:    umull v19.2d, v19.2s, v25.2s
+; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
+; CHECK-BE-NEXT:    umull v5.2d, v6.2s, v20.2s
+; CHECK-BE-NEXT:    umull v6.2d, v7.2s, v21.2s
+; CHECK-BE-NEXT:    add x8, x0, #112
+; CHECK-BE-NEXT:    umull v4.2d, v4.2s, v16.2s
+; CHECK-BE-NEXT:    st1 { v18.2d }, [x9]
 ; CHECK-BE-NEXT:    add x9, x0, #80
-; CHECK-BE-NEXT:    rev32 v7.8b, v7.8b
-; CHECK-BE-NEXT:    rev32 v16.8b, v16.8b
-; CHECK-BE-NEXT:    st1 { v6.2d }, [x8]
-; CHECK-BE-NEXT:    add x8, x0, #48
-; CHECK-BE-NEXT:    ext v6.16b, v20.16b, v20.16b, #8
-; CHECK-BE-NEXT:    st1 { v5.2d }, [x0]
-; CHECK-BE-NEXT:    umull v17.2d, v19.2s, v22.2s
-; CHECK-BE-NEXT:    umull v4.2d, v4.2s, v18.2s
-; CHECK-BE-NEXT:    umull v5.2d, v7.2s, v6.2s
-; CHECK-BE-NEXT:    umull v6.2d, v16.2s, v21.2s
-; CHECK-BE-NEXT:    st1 { v17.2d }, [x9]
-; CHECK-BE-NEXT:    add x9, x0, #16
-; CHECK-BE-NEXT:    add x0, x0, #64
+; CHECK-BE-NEXT:    st1 { v22.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v17.2d }, [x8]
+; CHECK-BE-NEXT:    add x8, x0, #64
+; CHECK-BE-NEXT:    st1 { v19.2d }, [x9]
+; CHECK-BE-NEXT:    add x9, x0, #48
+; CHECK-BE-NEXT:    mov x0, x8
 ; CHECK-BE-NEXT:    st1 { v5.2d }, [x8]
-; CHECK-BE-NEXT:    st1 { v4.2d }, [x0]
 ; CHECK-BE-NEXT:    st1 { v6.2d }, [x9]
+; CHECK-BE-NEXT:    st1 { v4.2d }, [x10]
 ; CHECK-BE-NEXT:    b.ne .LBB25_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    mov w0, wzr
@@ -3047,33 +3052,33 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-NEXT:  Lloh51:
 ; CHECK-NEXT:    adrp x9, lCPI26_1@PAGE
 ; CHECK-NEXT:  Lloh52:
-; CHECK-NEXT:    adrp x10, lCPI26_2@PAGE
+; CHECK-NEXT:    adrp x10, lCPI26_3@PAGE
 ; CHECK-NEXT:  Lloh53:
-; CHECK-NEXT:    adrp x11, lCPI26_3@PAGE
-; CHECK-NEXT:  Lloh54:
 ; CHECK-NEXT:    ldr q0, [x8, lCPI26_0@PAGEOFF]
-; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:  Lloh54:
+; CHECK-NEXT:    adrp x8, lCPI26_2@PAGE
 ; CHECK-NEXT:  Lloh55:
 ; CHECK-NEXT:    ldr q1, [x9, lCPI26_1@PAGEOFF]
 ; CHECK-NEXT:  Lloh56:
-; CHECK-NEXT:    ldr q2, [x10, lCPI26_2@PAGEOFF]
+; CHECK-NEXT:    ldr q2, [x8, lCPI26_2@PAGEOFF]
 ; CHECK-NEXT:  Lloh57:
-; CHECK-NEXT:    ldr q3, [x11, lCPI26_3@PAGEOFF]
+; CHECK-NEXT:    ldr q3, [x10, lCPI26_3@PAGEOFF]
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:  LBB26_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q4, [x1], #16
-; CHECK-NEXT:    ldp q5, q6, [x0, #32]
+; CHECK-NEXT:    ldr q18, [x0]
+; CHECK-NEXT:    ldp q16, q17, [x0, #32]
 ; CHECK-NEXT:    subs w2, w2, #1
-; CHECK-NEXT:    tbl.16b v16, { v4 }, v0
-; CHECK-NEXT:    tbl.16b v18, { v4 }, v1
-; CHECK-NEXT:    tbl.16b v19, { v4 }, v2
+; CHECK-NEXT:    tbl.16b v5, { v4 }, v0
+; CHECK-NEXT:    tbl.16b v6, { v4 }, v1
+; CHECK-NEXT:    tbl.16b v7, { v4 }, v2
 ; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
-; CHECK-NEXT:    ldr q7, [x0]
-; CHECK-NEXT:    ldr q17, [x8, #16]!
-; CHECK-NEXT:    mul.4s v5, v5, v16
-; CHECK-NEXT:    mul.4s v6, v6, v18
-; CHECK-NEXT:    mul.4s v7, v7, v19
-; CHECK-NEXT:    mul.4s v4, v17, v4
+; CHECK-NEXT:    mul.4s v5, v16, v5
+; CHECK-NEXT:    ldr q16, [x8, #16]!
+; CHECK-NEXT:    mul.4s v6, v17, v6
+; CHECK-NEXT:    mul.4s v7, v18, v7
+; CHECK-NEXT:    mul.4s v4, v16, v4
 ; CHECK-NEXT:    stp q5, q6, [x0, #32]
 ; CHECK-NEXT:    str q7, [x0]
 ; CHECK-NEXT:    mov x0, x8
@@ -3082,10 +3087,11 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh53, Lloh57
-; CHECK-NEXT:    .loh AdrpLdr Lloh52, Lloh56
+; CHECK-NEXT:    .loh AdrpLdr Lloh54, Lloh56
+; CHECK-NEXT:    .loh AdrpLdr Lloh52, Lloh57
 ; CHECK-NEXT:    .loh AdrpLdr Lloh51, Lloh55
-; CHECK-NEXT:    .loh AdrpLdr Lloh50, Lloh54
+; CHECK-NEXT:    .loh AdrpAdrp Lloh50, Lloh54
+; CHECK-NEXT:    .loh AdrpLdr Lloh50, Lloh53
 ;
 ; CHECK-BE-LABEL: test_widening_instr_mull_2:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -3105,31 +3111,31 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) {
 ; CHECK-BE-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-BE-NEXT:    ld1 { v4.16b }, [x1]
 ; CHECK-BE-NEXT:    add x8, x0, #32
+; CHECK-BE-NEXT:    ld1 { v16.4s }, [x0]
 ; CHECK-BE-NEXT:    add x9, x0, #48
 ; CHECK-BE-NEXT:    add x10, x0, #16
-; CHECK-BE-NEXT:    ld1 { v6.4s }, [x0]
+; CHECK-BE-NEXT:    ld1 { v17.4s }, [x8]
+; CHECK-BE-NEXT:    ld1 { v18.4s }, [x9]
+; CHECK-BE-NEXT:    ld1 { v19.4s }, [x10]
 ; CHECK-BE-NEXT:    subs w2, w2, #1
-; CHECK-BE-NEXT:    add x1, x1, #16
-; CHECK-BE-NEXT:    ld1 { v16.4s }, [x8]
 ; CHECK-BE-NEXT:    tbl v5.16b, { v4.16b }, v1.16b
-; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v0.16b
-; CHECK-BE-NEXT:    ld1 { v18.4s }, [x10]
-; CHECK-BE-NEXT:    tbl v17.16b, { v4.16b }, v3.16b
-; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v2.16b
+; CHECK-BE-NEXT:    tbl v6.16b, { v4.16b }, v3.16b
+; CHECK-BE-NEXT:    tbl v7.16b, { v4.16b }, v2.16b
+; CHECK-BE-NEXT:    tbl v4.16b, { v4.16b }, v0.16b
+; CHECK-BE-NEXT:    add x1, x1, #16
 ; CHECK-BE-NEXT:    rev32 v5.16b, v5.16b
+; CHECK-BE-NEXT:    rev32 v6.16b, v6.16b
 ; CHECK-BE-NEXT:    rev32 v7.16b, v7.16b
-; CHECK-BE-NEXT:    rev32 v17.16b, v17.16b
 ; CHECK-BE-NEXT:    rev32 v4.16b, v4.16b
-; CHECK-BE-NEXT:    mul v5.4s, v6.4s, v5.4s
-; CHECK-BE-NEXT:    ld1 { v6.4s }, [x9]
+; CHECK-BE-NEXT:    mul v5.4s, v16.4s, v5.4s
+; CHECK-BE-NEXT:    mul v6.4s, v17.4s, v6.4s
 ; CHECK-BE-NEXT:    mul v7.4s, v18.4s, v7.4s
+; CHECK-BE-NEXT:    mul v4.4s, v19.4s, v4.4s
 ; CHECK-BE-NEXT:    st1 { v5.4s }, [x0]
 ; CHECK-BE-NEXT:    mov x0, x10
-; CHECK-BE-NEXT:    mul v5.4s, v16.4s, v17.4s
-; CHECK-BE-NEXT:    st1 { v7.4s }, [x10]
-; CHECK-BE-NEXT:    mul v4.4s, v6.4s, v4.4s
-; CHECK-BE-NEXT:    st1 { v5.4s }, [x8]
-; CHECK-BE-NEXT:    st1 { v4.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v6.4s }, [x8]
+; CHECK-BE-NEXT:    st1 { v7.4s }, [x9]
+; CHECK-BE-NEXT:    st1 { v4.4s }, [x10]
 ; CHECK-BE-NEXT:    b.ne .LBB26_1
 ; CHECK-BE-NEXT:  // %bb.2: // %exit
 ; CHECK-BE-NEXT:    mov w0, wzr

diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 8ac9dd8fdc62bc..603e4addcfe980 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -341,33 +341,33 @@ entry:
 define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
 ; CHECK-SD-LABEL: zext_v3i8_v3i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmov s1, w0
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    movi v1.2d, #0x000000000000ff
 ; CHECK-SD-NEXT:    fmov s3, w2
-; CHECK-SD-NEXT:    movi v0.2d, #0x000000000000ff
 ; CHECK-SD-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-SD-NEXT:    mov v1.s[1], w1
-; CHECK-SD-NEXT:    ushll v3.2d, v3.2s, #0
-; CHECK-SD-NEXT:    mov v2.b[0], v3.b[0]
-; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
-; CHECK-SD-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    mov v0.s[1], w1
+; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ushll v1.2d, v3.2s, #0
+; CHECK-SD-NEXT:    mov v2.b[0], v1.b[0]
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    adrp x8, .LCPI21_0
 ; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    adrp x8, .LCPI21_0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    mov v0.d[1], x1
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI21_0]
 ; CHECK-GI-NEXT:    and x8, x2, #0xff
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -409,9 +409,9 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-SD-NEXT:    ushll v2.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll v0.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v2.2d, v2.4s, #0
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SD-NEXT:    ret
 ;
@@ -421,12 +421,12 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov h2, v0.h[2]
 ; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    ubfx x8, x8, #0, #16
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    fmov w10, s2
-; CHECK-GI-NEXT:    ubfx x8, x8, #0, #16
+; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    ubfx x9, x9, #0, #16
 ; CHECK-GI-NEXT:    ubfx x10, x10, #0, #16
-; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
 ; CHECK-GI-NEXT:    ret
@@ -440,8 +440,8 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    ushll v3.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v2.2d, v0.4s, #0
-; CHECK-SD-NEXT:    fmov d0, d3
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    fmov d0, d3
 ; CHECK-SD-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SD-NEXT:    ret
@@ -451,9 +451,9 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) {
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    fmov w10, s2
-; CHECK-GI-NEXT:    fmov d0, x8
 ; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    fmov d2, x10
 ; CHECK-GI-NEXT:    ret
@@ -537,11 +537,11 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
 ; CHECK-SD-LABEL: zext_v3i10_v3i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    fmov s1, w2
 ; CHECK-SD-NEXT:    mov w8, #1023 // =0x3ff
-; CHECK-SD-NEXT:    fmov s3, w2
-; CHECK-SD-NEXT:    mov v0.s[1], w1
 ; CHECK-SD-NEXT:    dup v2.2d, x8
-; CHECK-SD-NEXT:    ushll v3.2d, v3.2s, #0
+; CHECK-SD-NEXT:    mov v0.s[1], w1
+; CHECK-SD-NEXT:    ushll v3.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-SD-NEXT:    and v2.8b, v3.8b, v2.8b
@@ -554,14 +554,14 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    adrp x8, .LCPI27_0
 ; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT:    adrp x8, .LCPI27_0
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    mov v0.d[1], x1
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI27_0]
 ; CHECK-GI-NEXT:    and x8, x2, #0x3ff
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    fmov d2, x8
+; CHECK-GI-NEXT:    mov v0.d[1], x1
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -619,11 +619,11 @@ define <4 x i64> @zext_v4i8_v4i64(<4 x i8> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI30_0
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI30_0]
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -728,11 +728,11 @@ define <4 x i64> @zext_v4i10_v4i64(<4 x i10> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -774,11 +774,11 @@ define <8 x i64> @zext_v8i8_v8i64(<8 x i8> %a) {
 ; CHECK-SD-LABEL: zext_v8i8_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll v1.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
-; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -787,13 +787,13 @@ define <8 x i64> @zext_v8i8_v8i64(<8 x i8> %a) {
 ; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    mov d4, v3.d[1]
-; CHECK-GI-NEXT:    ushll v1.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v4.2s, #0
+; CHECK-GI-NEXT:    mov d3, v2.d[1]
+; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <8 x i8> %a to <8 x i64>
@@ -821,11 +821,11 @@ entry:
 define <8 x i64> @zext_v8i16_v8i64(<8 x i16> %a) {
 ; CHECK-SD-LABEL: zext_v8i16_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v1.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
-; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -833,13 +833,13 @@ define <8 x i64> @zext_v8i16_v8i64(<8 x i16> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    mov d4, v3.d[1]
-; CHECK-GI-NEXT:    ushll v1.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v4.2s, #0
+; CHECK-GI-NEXT:    mov d3, v2.d[1]
+; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <8 x i16> %a to <8 x i64>
@@ -849,21 +849,23 @@ entry:
 define <8 x i64> @zext_v8i32_v8i64(<8 x i32> %a) {
 ; CHECK-SD-LABEL: zext_v8i32_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v5.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v4.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    ushll2 v3.2d, v1.4s, #0
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v1.2s, #0
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v8i32_v8i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
 ; CHECK-GI-NEXT:    ushll v2.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v4.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v3.2s, #0
+; CHECK-GI-NEXT:    mov v1.16b, v4.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <8 x i32> %a to <8 x i64>
@@ -897,12 +899,12 @@ define <8 x i32> @zext_v8i10_v8i32(<8 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v8i10_v8i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    adrp x8, .LCPI44_0
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI44_0
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI44_0]
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -914,31 +916,31 @@ define <8 x i64> @zext_v8i10_v8i64(<8 x i10> %a) {
 ; CHECK-SD-LABEL: zext_v8i10_v8i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    bic v0.8h, #252, lsl #8
+; CHECK-SD-NEXT:    ushll v1.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
-; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v8i10_v8i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    adrp x8, .LCPI45_0
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    adrp x8, .LCPI45_0
+; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI45_0]
 ; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI45_0]
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
-; CHECK-GI-NEXT:    ushll v4.2d, v4.2s, #0
-; CHECK-GI-NEXT:    and v1.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT:    and v2.16b, v5.16b, v3.16b
-; CHECK-GI-NEXT:    and v3.16b, v4.16b, v3.16b
+; CHECK-GI-NEXT:    and v1.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT:    and v2.16b, v5.16b, v4.16b
+; CHECK-GI-NEXT:    ushll v3.2d, v3.2s, #0
+; CHECK-GI-NEXT:    and v3.16b, v3.16b, v4.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <8 x i10> %a to <8 x i64>
@@ -966,11 +968,11 @@ entry:
 define <16 x i32> @zext_v16i8_v16i32(<16 x i8> %a) {
 ; CHECK-SD-LABEL: zext_v16i8_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v1.8h, v0.8b, #0
 ; CHECK-SD-NEXT:    ushll2 v2.8h, v0.16b, #0
-; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll v0.4s, v1.4h, #0
 ; CHECK-SD-NEXT:    ushll2 v3.4s, v2.8h, #0
-; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v1.4s, v1.8h, #0
 ; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -978,13 +980,13 @@ define <16 x i32> @zext_v16i8_v16i32(<16 x i8> %a) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll v2.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov d4, v3.d[1]
-; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mov d3, v2.d[1]
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i8> %a to <16 x i32>
@@ -994,45 +996,45 @@ entry:
 define <16 x i64> @zext_v16i8_v16i64(<16 x i8> %a) {
 ; CHECK-SD-LABEL: zext_v16i8_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll2 v1.8h, v0.16b, #0
-; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-SD-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-SD-NEXT:    ushll v16.4s, v1.4h, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    ushll2 v7.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll2 v4.4s, v1.8h, #0
+; CHECK-SD-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ushll2 v3.2d, v4.4s, #0
-; CHECK-SD-NEXT:    ushll2 v5.2d, v16.4s, #0
-; CHECK-SD-NEXT:    ushll v6.2d, v2.2s, #0
-; CHECK-SD-NEXT:    ushll2 v1.2d, v0.4s, #0
 ; CHECK-SD-NEXT:    ushll v2.2d, v4.2s, #0
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ushll v4.2d, v16.2s, #0
+; CHECK-SD-NEXT:    ushll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    ushll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    ushll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v16i8_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    mov d2, v0.d[1]
-; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v0.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d5, v2.d[1]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    ushll v4.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v16.4s, v5.4h, #0
-; CHECK-GI-NEXT:    mov d3, v4.d[1]
-; CHECK-GI-NEXT:    mov d7, v6.d[1]
-; CHECK-GI-NEXT:    mov d17, v16.d[1]
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v4.2s, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v6.2s, #0
+; CHECK-GI-NEXT:    mov d5, v4.d[1]
+; CHECK-GI-NEXT:    ushll v4.2d, v4.2s, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov d3, v2.d[1]
+; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    mov d7, v6.d[1]
+; CHECK-GI-NEXT:    ushll v5.2d, v5.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v6.2s, #0
 ; CHECK-GI-NEXT:    ushll v3.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v7.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v16.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v17.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v7.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i8> %a to <16 x i64>
@@ -1042,21 +1044,23 @@ entry:
 define <16 x i32> @zext_v16i16_v16i32(<16 x i16> %a) {
 ; CHECK-SD-LABEL: zext_v16i16_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v5.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll2 v4.4s, v0.8h, #0
 ; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-SD-NEXT:    mov v0.16b, v5.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v16i16_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v4.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v1.16b, v4.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i16> %a to <16 x i32>
@@ -1066,40 +1070,39 @@ entry:
 define <16 x i64> @zext_v16i16_v16i64(<16 x i16> %a) {
 ; CHECK-SD-LABEL: zext_v16i16_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT:    ushll2 v4.4s, v1.8h, #0
-; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-SD-NEXT:    ushll2 v16.2d, v0.4s, #0
-; CHECK-SD-NEXT:    ushll2 v3.2d, v2.4s, #0
-; CHECK-SD-NEXT:    ushll2 v7.2d, v4.4s, #0
-; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-SD-NEXT:    ushll2 v5.2d, v1.4s, #0
-; CHECK-SD-NEXT:    ushll v6.2d, v4.2s, #0
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-SD-NEXT:    ushll v4.2d, v1.2s, #0
-; CHECK-SD-NEXT:    mov v1.16b, v16.16b
+; CHECK-SD-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll2 v4.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll v5.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll2 v6.4s, v1.8h, #0
+; CHECK-SD-NEXT:    ushll2 v1.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll v0.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll2 v3.2d, v4.4s, #0
+; CHECK-SD-NEXT:    ushll v2.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ushll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT:    ushll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT:    ushll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v16i16_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    mov d3, v0.d[1]
-; CHECK-GI-NEXT:    mov d4, v1.d[1]
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mov d1, v2.d[1]
-; CHECK-GI-NEXT:    ushll v0.2d, v2.2s, #0
-; CHECK-GI-NEXT:    mov d6, v5.d[1]
-; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v4.4h, #0
-; CHECK-GI-NEXT:    mov d7, v2.d[1]
-; CHECK-GI-NEXT:    mov d16, v3.d[1]
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    mov d7, v1.d[1]
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v1.2s, #0
+; CHECK-GI-NEXT:    mov d5, v2.d[1]
+; CHECK-GI-NEXT:    mov d16, v6.d[1]
 ; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v5.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v6.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v6.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v5.2s, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v7.2s, #0
 ; CHECK-GI-NEXT:    ushll v7.2d, v16.2s, #0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1111,35 +1114,34 @@ define <16 x i64> @zext_v16i32_v16i64(<16 x i32> %a) {
 ; CHECK-SD-LABEL: zext_v16i32_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    ushll2 v17.2d, v0.4s, #0
-; CHECK-SD-NEXT:    ushll2 v18.2d, v1.4s, #0
-; CHECK-SD-NEXT:    ushll v16.2d, v1.2s, #0
-; CHECK-SD-NEXT:    ushll2 v5.2d, v2.4s, #0
-; CHECK-SD-NEXT:    ushll2 v7.2d, v3.4s, #0
+; CHECK-SD-NEXT:    ushll2 v16.2d, v1.4s, #0
+; CHECK-SD-NEXT:    ushll v18.2d, v1.2s, #0
 ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll v4.2d, v2.2s, #0
+; CHECK-SD-NEXT:    ushll2 v5.2d, v2.4s, #0
+; CHECK-SD-NEXT:    ushll2 v7.2d, v3.4s, #0
 ; CHECK-SD-NEXT:    ushll v6.2d, v3.2s, #0
 ; CHECK-SD-NEXT:    mov v1.16b, v17.16b
-; CHECK-SD-NEXT:    mov v2.16b, v16.16b
-; CHECK-SD-NEXT:    mov v3.16b, v18.16b
+; CHECK-SD-NEXT:    mov v2.16b, v18.16b
+; CHECK-SD-NEXT:    mov v3.16b, v16.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v16i32_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov d5, v1.d[1]
-; CHECK-GI-NEXT:    mov d6, v2.d[1]
-; CHECK-GI-NEXT:    ushll v16.2d, v0.2s, #0
-; CHECK-GI-NEXT:    mov d0, v0.d[1]
+; CHECK-GI-NEXT:    mov d6, v1.d[1]
+; CHECK-GI-NEXT:    mov d5, v0.d[1]
+; CHECK-GI-NEXT:    mov d7, v2.d[1]
+; CHECK-GI-NEXT:    mov d18, v3.d[1]
+; CHECK-GI-NEXT:    ushll v16.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    mov d2, v3.d[1]
-; CHECK-GI-NEXT:    ushll v17.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v18.2d, v5.2s, #0
-; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v6.2s, #0
+; CHECK-GI-NEXT:    ushll v17.2d, v6.2s, #0
+; CHECK-GI-NEXT:    ushll v1.2d, v5.2s, #0
 ; CHECK-GI-NEXT:    ushll v6.2d, v3.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v2.2s, #0
-; CHECK-GI-NEXT:    mov v0.16b, v16.16b
-; CHECK-GI-NEXT:    mov v2.16b, v17.16b
-; CHECK-GI-NEXT:    mov v3.16b, v18.16b
+; CHECK-GI-NEXT:    ushll v5.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v18.2s, #0
+; CHECK-GI-NEXT:    mov v2.16b, v16.16b
+; CHECK-GI-NEXT:    mov v3.16b, v17.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i32> %a to <16 x i64>
@@ -1151,11 +1153,11 @@ define <16 x i16> @zext_v16i10_v16i16(<16 x i10> %a) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr w8, [sp]
 ; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ldr w9, [sp, #8]
 ; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    ldr w8, [sp, #8]
-; CHECK-NEXT:    mov v0.h[1], w1
-; CHECK-NEXT:    mov v1.h[1], w8
 ; CHECK-NEXT:    ldr w8, [sp, #16]
+; CHECK-NEXT:    mov v0.h[1], w1
+; CHECK-NEXT:    mov v1.h[1], w9
 ; CHECK-NEXT:    mov v0.h[2], w2
 ; CHECK-NEXT:    mov v1.h[2], w8
 ; CHECK-NEXT:    ldr w8, [sp, #24]
@@ -1184,31 +1186,31 @@ entry:
 define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) {
 ; CHECK-SD-LABEL: zext_v16i10_v16i32:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldr w11, [sp, #32]
+; CHECK-SD-NEXT:    ldr w8, [sp, #32]
+; CHECK-SD-NEXT:    ldr w9, [sp]
 ; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    ldr w13, [sp]
 ; CHECK-SD-NEXT:    fmov s1, w4
 ; CHECK-SD-NEXT:    ldr w10, [sp, #40]
-; CHECK-SD-NEXT:    ldr w15, [sp, #8]
-; CHECK-SD-NEXT:    fmov s3, w11
-; CHECK-SD-NEXT:    fmov s2, w13
-; CHECK-SD-NEXT:    ldr w9, [sp, #48]
+; CHECK-SD-NEXT:    ldr w11, [sp, #8]
+; CHECK-SD-NEXT:    fmov s2, w9
+; CHECK-SD-NEXT:    fmov s3, w8
+; CHECK-SD-NEXT:    ldr w8, [sp, #48]
 ; CHECK-SD-NEXT:    mov v0.h[1], w1
-; CHECK-SD-NEXT:    ldr w14, [sp, #16]
+; CHECK-SD-NEXT:    ldr w9, [sp, #16]
+; CHECK-SD-NEXT:    movi v4.4s, #3, msl #8
 ; CHECK-SD-NEXT:    mov v1.h[1], w5
-; CHECK-SD-NEXT:    ldr w8, [sp, #56]
-; CHECK-SD-NEXT:    mov v2.h[1], w15
-; CHECK-SD-NEXT:    ldr w12, [sp, #24]
+; CHECK-SD-NEXT:    mov v2.h[1], w11
 ; CHECK-SD-NEXT:    mov v3.h[1], w10
 ; CHECK-SD-NEXT:    mov v0.h[2], w2
 ; CHECK-SD-NEXT:    mov v1.h[2], w6
-; CHECK-SD-NEXT:    mov v2.h[2], w14
-; CHECK-SD-NEXT:    mov v3.h[2], w9
+; CHECK-SD-NEXT:    mov v2.h[2], w9
+; CHECK-SD-NEXT:    mov v3.h[2], w8
+; CHECK-SD-NEXT:    ldr w8, [sp, #56]
+; CHECK-SD-NEXT:    ldr w9, [sp, #24]
 ; CHECK-SD-NEXT:    mov v0.h[3], w3
 ; CHECK-SD-NEXT:    mov v1.h[3], w7
-; CHECK-SD-NEXT:    mov v2.h[3], w12
+; CHECK-SD-NEXT:    mov v2.h[3], w9
 ; CHECK-SD-NEXT:    mov v3.h[3], w8
-; CHECK-SD-NEXT:    movi v4.4s, #3, msl #8
 ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
@@ -1221,34 +1223,34 @@ define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v16i10_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    fmov s16, w0
-; CHECK-GI-NEXT:    ldr s1, [sp, #8]
-; CHECK-GI-NEXT:    fmov s17, w4
-; CHECK-GI-NEXT:    ldr s4, [sp, #32]
+; CHECK-GI-NEXT:    fmov s4, w0
+; CHECK-GI-NEXT:    fmov s5, w4
+; CHECK-GI-NEXT:    ldr s2, [sp]
+; CHECK-GI-NEXT:    ldr s0, [sp, #8]
+; CHECK-GI-NEXT:    ldr s3, [sp, #32]
+; CHECK-GI-NEXT:    ldr s1, [sp, #40]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI53_0
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    ldr s2, [sp, #16]
-; CHECK-GI-NEXT:    mov v16.s[1], w1
-; CHECK-GI-NEXT:    ldr s6, [sp, #48]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    ldr s3, [sp, #24]
-; CHECK-GI-NEXT:    mov v17.s[1], w5
-; CHECK-GI-NEXT:    ldr s7, [sp, #56]
-; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI53_0]
-; CHECK-GI-NEXT:    mov v16.s[2], w2
-; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], w6
-; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
-; CHECK-GI-NEXT:    mov v16.s[3], w3
-; CHECK-GI-NEXT:    mov v4.s[3], v7.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], w7
-; CHECK-GI-NEXT:    and v2.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    and v0.16b, v16.16b, v1.16b
-; CHECK-GI-NEXT:    and v3.16b, v4.16b, v1.16b
-; CHECK-GI-NEXT:    and v1.16b, v17.16b, v1.16b
+; CHECK-GI-NEXT:    mov v4.s[1], w1
+; CHECK-GI-NEXT:    mov v5.s[1], w5
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[1], v1.s[0]
+; CHECK-GI-NEXT:    ldr s0, [sp, #16]
+; CHECK-GI-NEXT:    ldr s1, [sp, #48]
+; CHECK-GI-NEXT:    ldr q6, [x8, :lo12:.LCPI53_0]
+; CHECK-GI-NEXT:    mov v4.s[2], w2
+; CHECK-GI-NEXT:    mov v5.s[2], w6
+; CHECK-GI-NEXT:    mov v2.s[2], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[2], v1.s[0]
+; CHECK-GI-NEXT:    ldr s0, [sp, #24]
+; CHECK-GI-NEXT:    ldr s1, [sp, #56]
+; CHECK-GI-NEXT:    mov v4.s[3], w3
+; CHECK-GI-NEXT:    mov v5.s[3], w7
+; CHECK-GI-NEXT:    mov v2.s[3], v0.s[0]
+; CHECK-GI-NEXT:    mov v3.s[3], v1.s[0]
+; CHECK-GI-NEXT:    and v0.16b, v4.16b, v6.16b
+; CHECK-GI-NEXT:    and v1.16b, v5.16b, v6.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v6.16b
+; CHECK-GI-NEXT:    and v3.16b, v3.16b, v6.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i10> %a to <16 x i32>
@@ -1258,86 +1260,86 @@ entry:
 define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) {
 ; CHECK-SD-LABEL: zext_v16i10_v16i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mov w8, #1023 // =0x3ff
-; CHECK-SD-NEXT:    ldr s4, [sp]
+; CHECK-SD-NEXT:    fmov s0, w2
+; CHECK-SD-NEXT:    fmov s1, w0
+; CHECK-SD-NEXT:    ldr s2, [sp]
+; CHECK-SD-NEXT:    fmov s3, w4
+; CHECK-SD-NEXT:    fmov s4, w6
+; CHECK-SD-NEXT:    add x9, sp, #8
 ; CHECK-SD-NEXT:    ldr s5, [sp, #16]
+; CHECK-SD-NEXT:    ldr s6, [sp, #32]
+; CHECK-SD-NEXT:    ldr s7, [sp, #48]
+; CHECK-SD-NEXT:    mov v1.s[1], w1
+; CHECK-SD-NEXT:    mov v0.s[1], w3
+; CHECK-SD-NEXT:    ld1 { v2.s }[1], [x9]
+; CHECK-SD-NEXT:    mov v3.s[1], w5
+; CHECK-SD-NEXT:    mov v4.s[1], w7
 ; CHECK-SD-NEXT:    add x9, sp, #24
-; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    fmov s1, w2
-; CHECK-SD-NEXT:    dup v7.2d, x8
-; CHECK-SD-NEXT:    add x8, sp, #8
-; CHECK-SD-NEXT:    fmov s2, w4
-; CHECK-SD-NEXT:    fmov s3, w6
+; CHECK-SD-NEXT:    add x10, sp, #40
+; CHECK-SD-NEXT:    add x11, sp, #56
 ; CHECK-SD-NEXT:    ld1 { v5.s }[1], [x9]
-; CHECK-SD-NEXT:    add x9, sp, #56
-; CHECK-SD-NEXT:    ld1 { v4.s }[1], [x8]
-; CHECK-SD-NEXT:    add x8, sp, #40
-; CHECK-SD-NEXT:    ldr s6, [sp, #32]
-; CHECK-SD-NEXT:    ldr s16, [sp, #48]
-; CHECK-SD-NEXT:    mov v0.s[1], w1
-; CHECK-SD-NEXT:    mov v1.s[1], w3
-; CHECK-SD-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-SD-NEXT:    mov v2.s[1], w5
-; CHECK-SD-NEXT:    ld1 { v16.s }[1], [x9]
-; CHECK-SD-NEXT:    mov v3.s[1], w7
-; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-SD-NEXT:    ld1 { v6.s }[1], [x10]
+; CHECK-SD-NEXT:    ld1 { v7.s }[1], [x11]
+; CHECK-SD-NEXT:    mov w8, #1023 // =0x3ff
 ; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-SD-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT:    dup v16.2d, x8
+; CHECK-SD-NEXT:    ushll v17.2d, v0.2s, #0
 ; CHECK-SD-NEXT:    ushll v3.2d, v3.2s, #0
 ; CHECK-SD-NEXT:    ushll v4.2d, v4.2s, #0
+; CHECK-SD-NEXT:    ushll v18.2d, v2.2s, #0
 ; CHECK-SD-NEXT:    ushll v5.2d, v5.2s, #0
 ; CHECK-SD-NEXT:    ushll v6.2d, v6.2s, #0
-; CHECK-SD-NEXT:    ushll v16.2d, v16.2s, #0
-; CHECK-SD-NEXT:    and v0.16b, v0.16b, v7.16b
-; CHECK-SD-NEXT:    and v1.16b, v1.16b, v7.16b
-; CHECK-SD-NEXT:    and v2.16b, v2.16b, v7.16b
-; CHECK-SD-NEXT:    and v3.16b, v3.16b, v7.16b
-; CHECK-SD-NEXT:    and v4.16b, v4.16b, v7.16b
-; CHECK-SD-NEXT:    and v5.16b, v5.16b, v7.16b
-; CHECK-SD-NEXT:    and v6.16b, v6.16b, v7.16b
-; CHECK-SD-NEXT:    and v7.16b, v16.16b, v7.16b
+; CHECK-SD-NEXT:    ushll v7.2d, v7.2s, #0
+; CHECK-SD-NEXT:    and v0.16b, v1.16b, v16.16b
+; CHECK-SD-NEXT:    and v1.16b, v17.16b, v16.16b
+; CHECK-SD-NEXT:    and v2.16b, v3.16b, v16.16b
+; CHECK-SD-NEXT:    and v3.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT:    and v4.16b, v18.16b, v16.16b
+; CHECK-SD-NEXT:    and v5.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT:    and v6.16b, v6.16b, v16.16b
+; CHECK-SD-NEXT:    and v7.16b, v7.16b, v16.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: zext_v16i10_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s7, w0
+; CHECK-GI-NEXT:    fmov s17, w2
 ; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    fmov s6, w0
+; CHECK-GI-NEXT:    fmov s18, w4
+; CHECK-GI-NEXT:    fmov s19, w6
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
-; CHECK-GI-NEXT:    fmov s16, w2
 ; CHECK-GI-NEXT:    ldr s2, [sp, #16]
-; CHECK-GI-NEXT:    fmov s18, w4
 ; CHECK-GI-NEXT:    ldr s3, [sp, #24]
-; CHECK-GI-NEXT:    fmov s19, w6
 ; CHECK-GI-NEXT:    ldr s4, [sp, #32]
-; CHECK-GI-NEXT:    adrp x8, .LCPI54_0
 ; CHECK-GI-NEXT:    ldr s5, [sp, #40]
-; CHECK-GI-NEXT:    ldr s7, [sp, #48]
-; CHECK-GI-NEXT:    ldr s17, [sp, #56]
-; CHECK-GI-NEXT:    mov v6.s[1], w1
-; CHECK-GI-NEXT:    mov v16.s[1], w3
+; CHECK-GI-NEXT:    ldr s6, [sp, #48]
+; CHECK-GI-NEXT:    ldr s16, [sp, #56]
+; CHECK-GI-NEXT:    mov v7.s[1], w1
+; CHECK-GI-NEXT:    mov v17.s[1], w3
 ; CHECK-GI-NEXT:    mov v18.s[1], w5
 ; CHECK-GI-NEXT:    mov v19.s[1], w7
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
 ; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    mov v7.s[1], v17.s[0]
-; CHECK-GI-NEXT:    ldr q17, [x8, :lo12:.LCPI54_0]
-; CHECK-GI-NEXT:    ushll v1.2d, v6.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v16.2s, #0
+; CHECK-GI-NEXT:    mov v6.s[1], v16.s[0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI54_0
+; CHECK-GI-NEXT:    ldr q16, [x8, :lo12:.LCPI54_0]
+; CHECK-GI-NEXT:    ushll v1.2d, v7.2s, #0
+; CHECK-GI-NEXT:    ushll v3.2d, v17.2s, #0
 ; CHECK-GI-NEXT:    ushll v5.2d, v18.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v19.2s, #0
-; CHECK-GI-NEXT:    ushll v16.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll v7.2d, v19.2s, #0
+; CHECK-GI-NEXT:    ushll v17.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll v18.2d, v2.2s, #0
 ; CHECK-GI-NEXT:    ushll v19.2d, v4.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v7.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v1.16b, v17.16b
-; CHECK-GI-NEXT:    and v1.16b, v3.16b, v17.16b
-; CHECK-GI-NEXT:    and v2.16b, v5.16b, v17.16b
-; CHECK-GI-NEXT:    and v3.16b, v6.16b, v17.16b
-; CHECK-GI-NEXT:    and v4.16b, v16.16b, v17.16b
-; CHECK-GI-NEXT:    and v5.16b, v18.16b, v17.16b
-; CHECK-GI-NEXT:    and v6.16b, v19.16b, v17.16b
-; CHECK-GI-NEXT:    and v7.16b, v7.16b, v17.16b
+; CHECK-GI-NEXT:    ushll v20.2d, v6.2s, #0
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v16.16b
+; CHECK-GI-NEXT:    and v1.16b, v3.16b, v16.16b
+; CHECK-GI-NEXT:    and v2.16b, v5.16b, v16.16b
+; CHECK-GI-NEXT:    and v3.16b, v7.16b, v16.16b
+; CHECK-GI-NEXT:    and v4.16b, v17.16b, v16.16b
+; CHECK-GI-NEXT:    and v5.16b, v18.16b, v16.16b
+; CHECK-GI-NEXT:    and v6.16b, v19.16b, v16.16b
+; CHECK-GI-NEXT:    and v7.16b, v20.16b, v16.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i10> %a to <16 x i64>

diff --git a/llvm/test/MC/AArch64/elf-globaladdress.ll b/llvm/test/MC/AArch64/elf-globaladdress.ll
index 2271e83e533124..7aa0e564abb88b 100644
--- a/llvm/test/MC/AArch64/elf-globaladdress.ll
+++ b/llvm/test/MC/AArch64/elf-globaladdress.ll
@@ -41,22 +41,22 @@ define dso_local void @address() {
 
 ; OBJ: Relocations [
 ; OBJ:   Section {{.*}} .rela.text {
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21   var8
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21   var16
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_LDST8_ABS_LO12_NC  var8
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21   var32
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_LDST16_ABS_LO12_NC var16
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21   var64
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_LDST32_ABS_LO12_NC var32
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_LDST64_ABS_LO12_NC var64
-
-; This is on the store, so not really important, but it stops the next
-; match working.
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_LDST64_ABS_LO12_NC var64
-
-; Pure address-calculation against var64
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21   var64
-; OBJ:     0x{{[0-9,A-F]+}} R_AARCH64_ADD_ABS_LO12_NC    var64
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 var8 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_LDST8_ABS_LO12_NC var8 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_LDST8_ABS_LO12_NC var8 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 var16 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_LDST16_ABS_LO12_NC var16 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_LDST16_ABS_LO12_NC var16 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 var32 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_LDST32_ABS_LO12_NC var32 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_LDST32_ABS_LO12_NC var32 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 var64 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_LDST64_ABS_LO12_NC var64 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_LDST64_ABS_LO12_NC var64 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 globaddr 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 var64 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_ADD_ABS_LO12_NC var64 0x0
+; OBJ:      0x{{[0-9,A-F]+}} R_AARCH64_LDST64_ABS_LO12_NC globaddr 0x0
 
 ; OBJ:   }
 ; OBJ: ]

diff --git a/llvm/test/MachineVerifier/test_g_concat_vectors.mir b/llvm/test/MachineVerifier/test_g_concat_vectors.mir
index b24401db139940..5c4de1bbc6de72 100644
--- a/llvm/test/MachineVerifier/test_g_concat_vectors.mir
+++ b/llvm/test/MachineVerifier/test_g_concat_vectors.mir
@@ -1,4 +1,4 @@
-#RUN: not --crash llc -o - -global-isel -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
+#RUN: not --crash llc -o - -global-isel -mtriple=aarch64 -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
 # REQUIRES: aarch64-registered-target
 ---
 name:            g_concat_vectors

diff --git a/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll b/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
index 711422fe399451..165aec4fce2074 100644
--- a/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
+++ b/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
@@ -10,9 +10,9 @@ define void @f(ptr %p, i32 %n, i32 %m) {
 ; CHECK-NEXT:    add w8, w2, #1
 ; CHECK-NEXT:  .LBB0_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    subs w1, w1, #1
 ; CHECK-NEXT:    strb wzr, [x0, w8, sxtw]
 ; CHECK-NEXT:    add w8, w8, #1
-; CHECK-NEXT:    subs w1, w1, #1
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %exit
 ; CHECK-NEXT:    ret
@@ -37,8 +37,8 @@ define void @f_without_freeze(ptr %p, i32 %n, i32 %m) {
 ; CHECK-NEXT:    add w8, w2, #1
 ; CHECK-NEXT:  .LBB1_1: // %loop
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    strb wzr, [x0, w8, sxtw]
 ; CHECK-NEXT:    subs w1, w1, #1
+; CHECK-NEXT:    strb wzr, [x0, w8, sxtw]
 ; CHECK-NEXT:    add w8, w8, #1
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %exit

diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
index 25f3cdb7d491da..1c58b90f77dea7 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
@@ -14,9 +14,9 @@ define void @test1(ptr %s, i32 %n) {
 ; CHECK-NEXT:    b.ge .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add w10, w8, #1
-; CHECK-NEXT:    stp w10, w8, [x9]
-; CHECK-NEXT:    mov w8, w10
+; CHECK-NEXT:    str w8, [x9, #4]
+; CHECK-NEXT:    add w8, w8, #1
+; CHECK-NEXT:    str w8, [x9]
 ; CHECK-NEXT:    cmp w8, w1
 ; CHECK-NEXT:    b.lt .LBB0_1
 ; CHECK-NEXT:  .LBB0_2: // %while_end
@@ -54,9 +54,9 @@ define void @test2(ptr %struct, i32 %n) {
 ; CHECK-NEXT:    b.ge .LBB1_3
 ; CHECK-NEXT:  .LBB1_2: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add w10, w8, #1
-; CHECK-NEXT:    stp w10, w8, [x9]
-; CHECK-NEXT:    mov w8, w10
+; CHECK-NEXT:    str w8, [x9, #4]
+; CHECK-NEXT:    add w8, w8, #1
+; CHECK-NEXT:    str w8, [x9]
 ; CHECK-NEXT:    cmp w8, w1
 ; CHECK-NEXT:    b.lt .LBB1_2
 ; CHECK-NEXT:  .LBB1_3: // %while_end
@@ -96,9 +96,9 @@ define void @test3(ptr %s1, ptr %s2, i1 %cond, i32 %n) {
 ; CHECK-NEXT:    b.ge .LBB2_3
 ; CHECK-NEXT:  .LBB2_2: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add w10, w8, #1
-; CHECK-NEXT:    stp w10, w8, [x9]
-; CHECK-NEXT:    mov w8, w10
+; CHECK-NEXT:    str w8, [x9, #4]
+; CHECK-NEXT:    add w8, w8, #1
+; CHECK-NEXT:    str w8, [x9]
 ; CHECK-NEXT:    cmp w8, w3
 ; CHECK-NEXT:    b.lt .LBB2_2
 ; CHECK-NEXT:  .LBB2_3: // %while_end
@@ -165,9 +165,9 @@ define void @test4(i32 %n) uwtable personality ptr @__FrameHandler {
 ; CHECK-NEXT:    b.ge .LBB3_4
 ; CHECK-NEXT:  // %bb.3: // %while_body
 ; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT:    add w9, w20, #1
-; CHECK-NEXT:    stp w9, w20, [x8]
-; CHECK-NEXT:    mov w20, w9
+; CHECK-NEXT:    str w20, [x8, #4]
+; CHECK-NEXT:    add w20, w20, #1
+; CHECK-NEXT:    str w20, [x8]
 ; CHECK-NEXT:    b .LBB3_1
 ; CHECK-NEXT:  .LBB3_4: // %while_end
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
@@ -228,9 +228,9 @@ define void @test5(ptr %s, i32 %n) {
 ; CHECK-NEXT:    b.ge .LBB4_2
 ; CHECK-NEXT:  .LBB4_1: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add w10, w8, #1
-; CHECK-NEXT:    stp w10, w8, [x9]
-; CHECK-NEXT:    mov w8, w10
+; CHECK-NEXT:    str w8, [x9, #4]
+; CHECK-NEXT:    add w8, w8, #1
+; CHECK-NEXT:    str w8, [x9]
 ; CHECK-NEXT:    cmp w8, w1
 ; CHECK-NEXT:    b.lt .LBB4_1
 ; CHECK-NEXT:  .LBB4_2: // %while_end

diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-ldp.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-ldp.ll
index 1c1c385b21f3ae..2eb41cd5c2fc6a 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-ldp.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-ldp.ll
@@ -12,22 +12,22 @@ define void @convolution(ptr %src0, ptr %src1, i64 %stride_xm, i64 %stride_xp, p
 ; CHECK-NEXT:    add x12, x0, x3
 ; CHECK-NEXT:  .LBB0_1: // %do.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add x13, x1, x8
+; CHECK-NEXT:    add x14, x0, x8
 ; CHECK-NEXT:    ldr q0, [x11, x8]
-; CHECK-NEXT:    add x13, x0, x8
+; CHECK-NEXT:    ldp q2, q3, [x14]
 ; CHECK-NEXT:    ldr q1, [x12, x8]
-; CHECK-NEXT:    add x14, x1, x8
-; CHECK-NEXT:    ldr q4, [x10, x8]
+; CHECK-NEXT:    ldp q6, q7, [x13]
 ; CHECK-NEXT:    subs w5, w5, #1
-; CHECK-NEXT:    ldp q2, q3, [x13]
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ldp q6, q1, [x14]
-; CHECK-NEXT:    fadd v2.4s, v2.4s, v3.4s
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ldr q4, [x10, x8]
 ; CHECK-NEXT:    ldr q5, [x9, x8]
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fadd v1.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    add x8, x8, #32
-; CHECK-NEXT:    fadd v1.4s, v6.4s, v1.4s
-; CHECK-NEXT:    fadd v3.4s, v4.4s, v5.4s
-; CHECK-NEXT:    fadd v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    fadd v2.4s, v4.4s, v5.4s
+; CHECK-NEXT:    fadd v3.4s, v6.4s, v7.4s
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    fadd v1.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    str q0, [x4], #16
 ; CHECK-NEXT:    b.ne .LBB0_1

diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll
index 536f9912f1b6fc..3c1094f2ee31db 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/pr53625.ll
@@ -135,8 +135,8 @@ define i32 @negative_test_type_is_struct(i32 %c, ptr %a, ptr %b) {
 ; CHECK-NEXT:    cbnz w9, .LBB2_5
 ; CHECK-NEXT:  // %bb.3: // %for.cond
 ; CHECK-NEXT:    // in Loop: Header=BB2_2 Depth=1
-; CHECK-NEXT:    add x1, x1, #4
 ; CHECK-NEXT:    subs x8, x8, #1
+; CHECK-NEXT:    add x1, x1, #4
 ; CHECK-NEXT:    b.ne .LBB2_2
 ; CHECK-NEXT:  .LBB2_4:
 ; CHECK-NEXT:    mov w0, wzr

diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
index 356d9fc1faa3b2..cd6b410b67aa3b 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
@@ -67,9 +67,9 @@ define float @test2(ptr nocapture readonly %arr, i64 %start, float %threshold) {
 ; CHECK-NEXT:    add x8, x0, #28
 ; CHECK-NEXT:  .LBB1_2: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    scvtf s2, x1
+; CHECK-NEXT:    scvtf s1, x1
+; CHECK-NEXT:    fadd s2, s1, s0
 ; CHECK-NEXT:    ldr s1, [x8, x1, lsl #2]
-; CHECK-NEXT:    fadd s2, s2, s0
 ; CHECK-NEXT:    fcmp s1, s2
 ; CHECK-NEXT:    b.gt .LBB1_5
 ; CHECK-NEXT:  // %bb.3: // %for.cond

diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
index a75abf44201bdf..110fa4d9b1cf1f 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
@@ -109,19 +109,19 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    adrp x9, x
-; CHECK-NEXT:    mov w10, #2 // =0x2
-; CHECK-NEXT:    mov w11, #3 // =0x3
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    str w8, [x9, :lo12:x]
-; CHECK-NEXT:    mov w9, #4 // =0x4
+; CHECK-NEXT:    adrp x11, x
+; CHECK-NEXT:    mov w9, #2 // =0x2
+; CHECK-NEXT:    mov w10, #3 // =0x3
+; CHECK-NEXT:    str w8, [x11, :lo12:x]
+; CHECK-NEXT:    mov w11, #4 // =0x4
 ; CHECK-NEXT:    stp w8, wzr, [x29, #-8]
-; CHECK-NEXT:    stur w10, [x29, #-12]
-; CHECK-NEXT:    stp w9, w11, [sp, #12]
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    stur w9, [x29, #-12]
+; CHECK-NEXT:    stp w11, w10, [sp, #12]
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    stp w10, w8, [x29, #-12]
-; CHECK-NEXT:    stp w9, w11, [sp, #12]
+; CHECK-NEXT:    stp w9, w8, [x29, #-12]
+; CHECK-NEXT:    stp w11, w10, [sp, #12]
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
 ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
@@ -134,7 +134,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    mov w10, #3 // =0x3
-; CHECK-NEXT:    mov w11, #4 // =0x4
 ; CHECK-NEXT:    stp w9, w8, [x29, #-12]
-; CHECK-NEXT:    stp w11, w10, [sp, #12]
+; CHECK-NEXT:    mov w8, #4 // =0x4
+; CHECK-NEXT:    stp w8, w10, [sp, #12]
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
index 01dd58dd9a6362..3d379ea1faf5f4 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
@@ -86,19 +86,19 @@ define dso_local i32 @main() #0 {
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    adrp x9, x
-; CHECK-NEXT:    mov w10, #2 // =0x2
-; CHECK-NEXT:    mov w11, #3 // =0x3
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    str w8, [x9, :lo12:x]
-; CHECK-NEXT:    mov w9, #4 // =0x4
+; CHECK-NEXT:    adrp x11, x
+; CHECK-NEXT:    mov w9, #2 // =0x2
+; CHECK-NEXT:    mov w10, #3 // =0x3
+; CHECK-NEXT:    str w8, [x11, :lo12:x]
+; CHECK-NEXT:    mov w11, #4 // =0x4
 ; CHECK-NEXT:    stp w8, wzr, [x29, #-8]
-; CHECK-NEXT:    stur w10, [x29, #-12]
-; CHECK-NEXT:    stp w9, w11, [sp, #12]
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    stur w9, [x29, #-12]
+; CHECK-NEXT:    stp w11, w10, [sp, #12]
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    stp w10, w8, [x29, #-12]
-; CHECK-NEXT:    stp w9, w11, [sp, #12]
+; CHECK-NEXT:    stp w9, w8, [x29, #-12]
+; CHECK-NEXT:    stp w11, w10, [sp, #12]
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
 ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
