[llvm] 02a1d31 - [AArch64] Extend and rewrite load zero and load undef patterns (#108185)

via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 19 06:52:57 PDT 2024


Author: David Green
Date: 2024-09-19T14:52:52+01:00
New Revision: 02a1d311bde4a90cffa661215c81f9fef1bc7967

URL: https://github.com/llvm/llvm-project/commit/02a1d311bde4a90cffa661215c81f9fef1bc7967
DIFF: https://github.com/llvm/llvm-project/commit/02a1d311bde4a90cffa661215c81f9fef1bc7967.diff

LOG: [AArch64] Extend and rewrite load zero and load undef patterns (#108185)

The ldr instructions implicitly zero any upper lanes, so we can use them
for insert(zerovec, load, 0) patterns. Likewise insert(undef, load, 0)
or scalar_to_vector can reuse the scalar loads as the top bits are undef.

This patch makes sure there are patterns for each type and for each of
the normal, unaligned, roW and roX addressing modes.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrFormats.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
    llvm/test/CodeGen/AArch64/extbinopload.ll
    llvm/test/CodeGen/AArch64/load-insert-undef.ll
    llvm/test/CodeGen/AArch64/load-insert-zero.ll
    llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll
    llvm/test/CodeGen/AArch64/neon-dotreduce.ll
    llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
    llvm/test/CodeGen/AArch64/trunc-to-tbl.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 16002011aedfbe..46b462de5071cb 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -177,6 +177,11 @@ def dup_v4f32 :
              [(v2f32 (extract_subvector (v4f32 (AArch64duplane32 (v4f32 node:$LHS), node:$RHS)), (i64 0))),
               (v2f32 (AArch64duplane32 (v4f32 node:$LHS), node:$RHS))]>;
 
+// Match either a scalar_to_vector (from SDAG) or a vector_insert of undef (from GISel)
+def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
+                          [(vector_insert undef, node:$src, (i64 0)),
+                           (scalar_to_vector node:$src)]>;
+
 //===----------------------------------------------------------------------===//
 // Asm Operand Classes.
 //

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9a2529a9534d09..c70e835d1619ff 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3321,63 +3321,6 @@ defm LDRSW  : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
 // Pre-fetch.
 defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
 
-def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
-                          [(vector_insert undef, node:$src, (i64 0)),
-                           (scalar_to_vector node:$src)]>;
-
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
-                              ValueType ScalTy, ValueType VecTy,
-                              Instruction LOADW, Instruction LOADX,
-                              SubRegIndex sub> {
-  def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-              (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
-            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                           (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
-                           sub)>;
-
-  def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
-              (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
-            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
-                           (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
-                           sub)>;
-}
-
-let AddedComplexity = 10 in {
-defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v8i8,  LDRBroW, LDRBroX, bsub>;
-defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v16i8, LDRBroW, LDRBroX, bsub>;
-
-defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
-defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
-
-defm : ScalToVecROLoadPat<ro16, load,       i32, v4f16, LDRHroW, LDRHroX, hsub>;
-defm : ScalToVecROLoadPat<ro16, load,       i32, v8f16, LDRHroW, LDRHroX, hsub>;
-
-defm : ScalToVecROLoadPat<ro32, load,       i32, v2i32, LDRSroW, LDRSroX, ssub>;
-defm : ScalToVecROLoadPat<ro32, load,       i32, v4i32, LDRSroW, LDRSroX, ssub>;
-
-defm : ScalToVecROLoadPat<ro32, load,       f32, v2f32, LDRSroW, LDRSroX, ssub>;
-defm : ScalToVecROLoadPat<ro32, load,       f32, v4f32, LDRSroW, LDRSroX, ssub>;
-
-defm : ScalToVecROLoadPat<ro64, load,       i64, v2i64, LDRDroW, LDRDroX, dsub>;
-
-defm : ScalToVecROLoadPat<ro64, load,       f64, v2f64, LDRDroW, LDRDroX, dsub>;
-
-
-def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-                      (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
-                                           ro_Wextend64:$extend))))),
-           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
-
-def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-                      (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
-                                           ro_Xextend64:$extend))))),
-           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
-}
-
 // Match all load 64 bits width whose type is compatible with FPR64
 multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
                         Instruction LOADW, Instruction LOADX> {
@@ -3501,42 +3444,6 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
 def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
            (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
 
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-def : Pat <(v8i8 (vec_ins_or_scal_vec (i32
-               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
-                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v16i8 (vec_ins_or_scal_vec (i32
-               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
-           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v4i16 (vec_ins_or_scal_vec (i32
-               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
-                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v8i16 (vec_ins_or_scal_vec (i32
-               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
-                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v2i32 (vec_ins_or_scal_vec (i32
-               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
-                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v4i32 (vec_ins_or_scal_vec (i32
-               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
-                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
-               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
-               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
-           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
-                          (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
-
 // Match all load 64 bits width whose type is compatible with FPR64
 let Predicates = [IsLE] in {
   // We must use LD1 to perform vector loads in big-endian.
@@ -3902,12 +3809,13 @@ def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
 def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
                 (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
 
-// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros,
-// load, 0) can use a single load.
-multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
-                                  ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
-                                  ComplexPattern Addr, ComplexPattern UnscaledAddr, Operand AddrImm,
-                                  SubRegIndex SubReg> {
+// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros, load, 0)
+// can use a single load. Same for scalar_to_vector(load) or insert(undef, load, 0).
+multiclass LoadInsertVTPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType ScalarVT,
+                                Instruction LoadInst, Instruction UnscaledLoadInst,
+                                Instruction ROWLoadInst, Instruction ROXLoadInst,
+                                ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                                Operand AddrImm, SubRegIndex SubReg> {
   // Scaled
   def : Pat <(vector_insert (VT immAllZerosV),
                 (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
@@ -3916,42 +3824,82 @@ multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueT
   def : Pat <(vector_insert (VT immAllZerosV),
                  (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
              (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
+  // roW
+  def : Pat <(vector_insert (VT immAllZerosV),
+                 (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (i64 0)),
+             (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+  // roX
+  def : Pat <(vector_insert (VT immAllZerosV),
+                 (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (i64 0)),
+             (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
 
-  // Half-vector patterns
-  def : Pat <(vector_insert (HVT immAllZerosV),
-                 (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-             (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
-  // Unscaled
-  def : Pat <(vector_insert (HVT immAllZerosV),
-                 (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
-             (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-
-  // SVE patterns
-  def : Pat <(vector_insert (SVT immAllZerosV),
-                 (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
-             (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
-  // Unscaled
-  def : Pat <(vector_insert (SVT immAllZerosV),
-                 (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
+  // Undef equivalents of the patterns above.
+  def : Pat <(VT (vec_ins_or_scal_vec
+                (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
+  def : Pat <(VT (vec_ins_or_scal_vec
+                 (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))))),
              (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-}
-
-defm : LoadInsertZeroPatterns<extloadi8,  v16i8,  v8i8,   nxv16i8,  i32,  LDRBui, LDURBi,
-                              am_indexed8,  am_unscaled8,  uimm12s1, bsub>;
-defm : LoadInsertZeroPatterns<extloadi16, v8i16,  v4i16,  nxv8i16,  i32,  LDRHui, LDURHi,
-                              am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load,       v4i32,  v2i32,  nxv4i32,  i32,  LDRSui, LDURSi,
-                              am_indexed32, am_unscaled32, uimm12s4, ssub>;
-defm : LoadInsertZeroPatterns<load,       v2i64,  v1i64,  nxv2i64,  i64,  LDRDui, LDURDi,
-                              am_indexed64, am_unscaled64, uimm12s8, dsub>;
-defm : LoadInsertZeroPatterns<load,       v8f16,  v4f16,  nxv8f16,  f16,  LDRHui, LDURHi,
-                              am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load,       v8bf16, v4bf16, nxv8bf16, bf16, LDRHui, LDURHi,
-                              am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load,       v4f32,  v2f32,  nxv4f32,  f32,  LDRSui, LDURSi,
-                              am_indexed32, am_unscaled32, uimm12s4, ssub>;
-defm : LoadInsertZeroPatterns<load,       v2f64,  v1f64,  nxv2f64,  f64,  LDRDui, LDURDi,
-                              am_indexed64, am_unscaled64, uimm12s8, dsub>;
+  def : Pat <(VT (vec_ins_or_scal_vec
+                 (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))))),
+             (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+  def : Pat <(VT (vec_ins_or_scal_vec
+                 (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))))),
+             (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
+}
+
+multiclass LoadInsertPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
+                              ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
+                              Instruction ROWLoadInst, Instruction ROXLoadInst,
+                              ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+                              Operand AddrImm, SubRegIndex SubReg> {
+  defm : LoadInsertVTPatterns<LoadOp, VT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                              ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+  defm : LoadInsertVTPatterns<LoadOp, HVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                              ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+  defm : LoadInsertVTPatterns<LoadOp, SVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+                              ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+}
+
+defm : LoadInsertPatterns<extloadi8,  v16i8,  v8i8,   nxv16i8,  i32,
+                          LDRBui, LDURBi, LDRBroW, LDRBroX,
+                          ro8, am_indexed8,  am_unscaled8,  uimm12s1, bsub>;
+defm : LoadInsertPatterns<extloadi16, v8i16,  v4i16,  nxv8i16,  i32,
+                          LDRHui, LDURHi, LDRHroW, LDRHroX,
+                          ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load,       v4i32,  v2i32,  nxv4i32,  i32,
+                          LDRSui, LDURSi, LDRSroW, LDRSroX,
+                          ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertPatterns<load,       v2i64,  isVoid, nxv2i64,  i64,
+                          LDRDui, LDURDi, LDRDroW, LDRDroX,
+                          ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+defm : LoadInsertPatterns<load,       v8f16,  v4f16,  nxv8f16,  f16,
+                          LDRHui, LDURHi, LDRHroW, LDRHroX,
+                          ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load,       v8bf16, v4bf16, nxv8bf16, bf16,
+                          LDRHui, LDURHi, LDRHroW, LDRHroX,
+                          ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load,       v4f32,  v2f32,  nxv4f32,  f32,
+                          LDRSui, LDURSi, LDRSroW, LDRSroX,
+                          ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertPatterns<load,       v2f64,  isVoid, nxv2f64,  f64,
+                          LDRDui, LDURDi, LDRDroW, LDRDroX,
+                          ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+
+// Extra patterns for v1i64 scalar_to_vector(load), which need to avoid the
+// SUBREG_TO_REG used above.
+def : Pat <(v1i64 (scalar_to_vector (i64
+               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+               (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))),
+           (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+               (load (ro64.Wpat GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend))))),
+           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+               (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
+           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
 
 // Pre-fetch.
 defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",

diff  --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
index 5cfa59a3022394..dbbfbea9176f6e 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
@@ -78,9 +78,9 @@ entry:
 define <16 x i8> @test5(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test5:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
 ; CHECK-NEXT:    ld1r { v1.16b }, [x1]
+; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
index e8d9ec7dc85de7..7686740aec3026 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
@@ -203,89 +203,89 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c)
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-NEXT:    ldr s16, [sp, #40]
+; CHECK-NEXT:    ldr s17, [sp, #40]
 ; CHECK-NEXT:    add x10, sp, #56
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
 ; CHECK-NEXT:    add x9, sp, #48
 ; CHECK-NEXT:    mov v1.s[1], v3.s[0]
 ; CHECK-NEXT:    ldr s3, [sp, #32]
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    // kill: def $s5 killed $s5 def $q5
 ; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
-; CHECK-NEXT:    ldr s18, [sp, #8]
-; CHECK-NEXT:    ld1 { v16.s }[1], [x10]
 ; CHECK-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-NEXT:    add x10, sp, #72
+; CHECK-NEXT:    ld1 { v17.s }[1], [x10]
+; CHECK-NEXT:    // kill: def $s5 killed $s5 def $q5
+; CHECK-NEXT:    ldr s16, [sp, #8]
+; CHECK-NEXT:    // kill: def $s4 killed $s4 def $q4
+; CHECK-NEXT:    add x10, sp, #24
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #64
-; CHECK-NEXT:    ldr s17, [sp, #104]
+; CHECK-NEXT:    add x9, sp, #72
 ; CHECK-NEXT:    // kill: def $s7 killed $s7 def $q7
-; CHECK-NEXT:    // kill: def $s4 killed $s4 def $q4
 ; CHECK-NEXT:    // kill: def $s6 killed $s6 def $q6
-; CHECK-NEXT:    ldr s2, [sp, #136]
-; CHECK-NEXT:    ldr s20, [sp, #192]
+; CHECK-NEXT:    ldr s2, [sp]
+; CHECK-NEXT:    ld1 { v16.s }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #112
+; CHECK-NEXT:    ldr s20, [sp, #136]
 ; CHECK-NEXT:    mov v1.s[2], v5.s[0]
-; CHECK-NEXT:    ld1 { v16.s }[2], [x10]
+; CHECK-NEXT:    ld1 { v17.s }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #64
 ; CHECK-NEXT:    ldr s5, [sp, #96]
 ; CHECK-NEXT:    ld1 { v3.s }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #24
-; CHECK-NEXT:    add x10, sp, #112
-; CHECK-NEXT:    ld1 { v18.s }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #88
 ; CHECK-NEXT:    mov v0.s[2], v4.s[0]
+; CHECK-NEXT:    add x9, sp, #88
+; CHECK-NEXT:    ldr s4, [sp, #104]
+; CHECK-NEXT:    ldr s19, [sp, #192]
 ; CHECK-NEXT:    ld1 { v5.s }[1], [x10]
 ; CHECK-NEXT:    add x10, sp, #80
-; CHECK-NEXT:    ld1 { v16.s }[3], [x9]
+; CHECK-NEXT:    ld1 { v17.s }[3], [x9]
 ; CHECK-NEXT:    mov v1.s[3], v7.s[0]
 ; CHECK-NEXT:    add x9, sp, #120
-; CHECK-NEXT:    ldr s4, [sp, #128]
 ; CHECK-NEXT:    ld1 { v3.s }[3], [x10]
-; CHECK-NEXT:    ld1 { v17.s }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #144
-; CHECK-NEXT:    ldr s7, [sp]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
+; CHECK-NEXT:    ldr s7, [sp, #128]
+; CHECK-NEXT:    add x10, sp, #144
 ; CHECK-NEXT:    mov v0.s[3], v6.s[0]
-; CHECK-NEXT:    add x10, sp, #16
+; CHECK-NEXT:    add x9, sp, #16
+; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
 ; CHECK-NEXT:    add x9, sp, #160
-; CHECK-NEXT:    fmul v6.4s, v16.4s, v1.4s
-; CHECK-NEXT:    fmul v19.4s, v17.4s, v18.4s
-; CHECK-NEXT:    fmul v18.4s, v5.4s, v18.4s
+; CHECK-NEXT:    fmul v6.4s, v17.4s, v1.4s
+; CHECK-NEXT:    fmul v18.4s, v4.4s, v16.4s
+; CHECK-NEXT:    fmul v16.4s, v5.4s, v16.4s
 ; CHECK-NEXT:    fmul v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v4.s }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #152
 ; CHECK-NEXT:    add x10, sp, #208
-; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
+; CHECK-NEXT:    ld1 { v7.s }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #152
+; CHECK-NEXT:    ld1 { v19.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v20.s }[1], [x9]
 ; CHECK-NEXT:    add x9, sp, #176
-; CHECK-NEXT:    ld1 { v20.s }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #184
 ; CHECK-NEXT:    fneg v6.4s, v6.4s
-; CHECK-NEXT:    fneg v19.4s, v19.4s
-; CHECK-NEXT:    fmla v18.4s, v7.4s, v17.4s
-; CHECK-NEXT:    fmla v1.4s, v0.4s, v16.4s
-; CHECK-NEXT:    ld1 { v4.s }[3], [x9]
+; CHECK-NEXT:    fneg v18.4s, v18.4s
+; CHECK-NEXT:    fmla v16.4s, v2.4s, v4.4s
+; CHECK-NEXT:    fmla v1.4s, v0.4s, v17.4s
+; CHECK-NEXT:    ld1 { v7.s }[3], [x9]
 ; CHECK-NEXT:    add x9, sp, #168
-; CHECK-NEXT:    ld1 { v2.s }[2], [x9]
-; CHECK-NEXT:    ldr s16, [sp, #200]
+; CHECK-NEXT:    ld1 { v20.s }[2], [x9]
+; CHECK-NEXT:    ldr s4, [sp, #200]
 ; CHECK-NEXT:    add x9, sp, #216
-; CHECK-NEXT:    add x10, sp, #184
 ; CHECK-NEXT:    fmla v6.4s, v0.4s, v3.4s
-; CHECK-NEXT:    fmla v19.4s, v7.4s, v5.4s
-; CHECK-NEXT:    ld1 { v16.s }[1], [x9]
-; CHECK-NEXT:    fsub v0.4s, v4.4s, v1.4s
-; CHECK-NEXT:    fsub v1.4s, v20.4s, v18.4s
-; CHECK-NEXT:    ld1 { v2.s }[3], [x10]
-; CHECK-NEXT:    fadd v3.4s, v16.4s, v19.4s
-; CHECK-NEXT:    fadd v2.4s, v2.4s, v6.4s
+; CHECK-NEXT:    fmla v18.4s, v2.4s, v5.4s
+; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
+; CHECK-NEXT:    fsub v0.4s, v7.4s, v1.4s
+; CHECK-NEXT:    fsub v1.4s, v19.4s, v16.4s
+; CHECK-NEXT:    ld1 { v20.s }[3], [x10]
+; CHECK-NEXT:    fadd v2.4s, v4.4s, v18.4s
+; CHECK-NEXT:    fadd v3.4s, v20.4s, v6.4s
 ; CHECK-NEXT:    ext v4.16b, v0.16b, v1.16b, #12
-; CHECK-NEXT:    ext v5.16b, v2.16b, v3.16b, #12
-; CHECK-NEXT:    trn2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    ext v5.16b, v3.16b, v2.16b, #12
+; CHECK-NEXT:    trn2 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    ext v4.16b, v0.16b, v4.16b, #12
-; CHECK-NEXT:    ext v5.16b, v2.16b, v5.16b, #8
+; CHECK-NEXT:    ext v5.16b, v3.16b, v5.16b, #8
 ; CHECK-NEXT:    rev64 v4.4s, v4.4s
-; CHECK-NEXT:    trn2 v3.4s, v4.4s, v5.4s
-; CHECK-NEXT:    zip2 v4.4s, v0.4s, v2.4s
-; CHECK-NEXT:    zip1 v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ext v1.16b, v3.16b, v1.16b, #8
-; CHECK-NEXT:    mov v4.d[1], v3.d[0]
+; CHECK-NEXT:    trn2 v2.4s, v4.4s, v5.4s
+; CHECK-NEXT:    zip2 v4.4s, v0.4s, v3.4s
+; CHECK-NEXT:    zip1 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    ext v1.16b, v2.16b, v1.16b, #8
+; CHECK-NEXT:    mov v4.d[1], v2.d[0]
 ; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    stp q4, q1, [x8, #16]
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index bd9d9b99622e34..72f4d58a425e78 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -667,30 +667,30 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-NEXT:    add x10, x3, #12
 ; CHECK-NEXT:    bic v1.8h, #255, lsl #8
 ; CHECK-NEXT:    ld1 { v0.s }[3], [x3], #4
-; CHECK-NEXT:    ldr s5, [x0, #4]
-; CHECK-NEXT:    ldp s2, s3, [x2, #4]
-; CHECK-NEXT:    ldr s7, [x2, #12]
-; CHECK-NEXT:    ldp s6, s4, [x0, #8]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3]
+; CHECK-NEXT:    ldr s3, [x0, #12]
+; CHECK-NEXT:    ldp s2, s7, [x0, #4]
+; CHECK-NEXT:    ldr s6, [x2, #12]
+; CHECK-NEXT:    ldp s5, s4, [x2, #4]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v6.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
+; CHECK-NEXT:    ld1 { v4.s }[1], [x8]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x3]
 ; CHECK-NEXT:    add x8, x1, #8
-; CHECK-NEXT:    ld1 { v4.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v7.8b
-; CHECK-NEXT:    uaddl v4.8h, v5.8b, v4.8b
-; CHECK-NEXT:    uaddw v1.8h, v1.8h, v6.8b
-; CHECK-NEXT:    uaddw2 v5.8h, v3.8h, v0.16b
-; CHECK-NEXT:    ushll v6.4s, v2.4h, #3
+; CHECK-NEXT:    ld1 { v7.s }[1], [x8]
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
+; CHECK-NEXT:    uaddl v3.8h, v5.8b, v6.8b
+; CHECK-NEXT:    uaddw v1.8h, v1.8h, v7.8b
+; CHECK-NEXT:    uaddw2 v4.8h, v4.8h, v0.16b
+; CHECK-NEXT:    ushll v0.4s, v2.4h, #3
+; CHECK-NEXT:    ushll v5.4s, v3.4h, #3
+; CHECK-NEXT:    ushll2 v3.4s, v3.8h, #3
 ; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #3
-; CHECK-NEXT:    ushll v0.4s, v4.4h, #3
-; CHECK-NEXT:    ushll2 v3.4s, v4.8h, #3
 ; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT:    uaddw2 v1.4s, v3.4s, v1.8h
-; CHECK-NEXT:    uaddw2 v3.4s, v2.4s, v5.8h
-; CHECK-NEXT:    uaddw v2.4s, v6.4s, v5.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v4.8h
+; CHECK-NEXT:    uaddw v2.4s, v5.4s, v4.4h
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
   store <4 x i8> %lp1, ptr %z
@@ -759,39 +759,40 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_shuffle:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s0, s1, [x0, #8]
-; CHECK-NEXT:    add x8, x1, #8
-; CHECK-NEXT:    ldr s6, [x1, #12]
-; CHECK-NEXT:    ldp s17, s18, [x2, #8]
-; CHECK-NEXT:    ldp s3, s5, [x2]
-; CHECK-NEXT:    add x9, x3, #8
-; CHECK-NEXT:    mov v4.16b, v1.16b
-; CHECK-NEXT:    ldp s7, s16, [x0]
-; CHECK-NEXT:    ldr s2, [x3, #12]
-; CHECK-NEXT:    mov v1.s[1], v6.s[0]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x3], #4
-; CHECK-NEXT:    mov v4.s[1], v6.s[0]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v16.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x3]
+; CHECK-NEXT:    ldp s17, s0, [x0, #8]
+; CHECK-NEXT:    add x8, x3, #8
+; CHECK-NEXT:    ldr s3, [x1, #12]
+; CHECK-NEXT:    ldp s2, s16, [x2]
+; CHECK-NEXT:    ldr s5, [x2, #12]
+; CHECK-NEXT:    add x9, x1, #8
+; CHECK-NEXT:    ldr s1, [x3, #12]
+; CHECK-NEXT:    mov v4.16b, v0.16b
+; CHECK-NEXT:    mov v0.s[1], v3.s[0]
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT:    ldp s6, s7, [x0]
+; CHECK-NEXT:    mov v4.s[1], v3.s[0]
+; CHECK-NEXT:    ld1 { v6.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v7.s }[1], [x1]
+; CHECK-NEXT:    ld1 { v16.s }[1], [x3]
+; CHECK-NEXT:    ldr s3, [x2, #8]
 ; CHECK-NEXT:    ld1 { v17.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v0.s }[1], [x8]
-; CHECK-NEXT:    mov v4.s[2], v18.s[0]
-; CHECK-NEXT:    mov v18.s[1], v2.s[0]
-; CHECK-NEXT:    uaddl v1.8h, v16.8b, v1.8b
-; CHECK-NEXT:    uaddl v6.8h, v7.8b, v0.8b
-; CHECK-NEXT:    uaddl v7.8h, v3.8b, v17.8b
-; CHECK-NEXT:    ushll v0.4s, v1.4h, #3
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #3
-; CHECK-NEXT:    uaddl v5.8h, v5.8b, v18.8b
-; CHECK-NEXT:    mov v4.s[3], v2.s[0]
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v6.4h
-; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v6.8h
-; CHECK-NEXT:    ushll v16.4s, v5.4h, #3
-; CHECK-NEXT:    ushll2 v3.4s, v5.8h, #3
+; CHECK-NEXT:    mov v4.s[2], v5.s[0]
+; CHECK-NEXT:    mov v5.s[1], v1.s[0]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x8]
+; CHECK-NEXT:    uaddl v0.8h, v7.8b, v0.8b
+; CHECK-NEXT:    uaddl v6.8h, v6.8b, v17.8b
+; CHECK-NEXT:    uaddl v5.8h, v16.8b, v5.8b
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    ushll v3.4s, v0.4h, #3
+; CHECK-NEXT:    ushll2 v16.4s, v0.8h, #3
+; CHECK-NEXT:    mov v4.s[3], v1.s[0]
+; CHECK-NEXT:    ushll v7.4s, v5.4h, #3
+; CHECK-NEXT:    ushll2 v5.4s, v5.8h, #3
+; CHECK-NEXT:    uaddw v0.4s, v3.4s, v6.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v16.4s, v6.8h
 ; CHECK-NEXT:    str q4, [x4]
-; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v7.8h
-; CHECK-NEXT:    uaddw v2.4s, v16.4s, v7.4h
+; CHECK-NEXT:    uaddw2 v3.4s, v5.4s, v2.8h
+; CHECK-NEXT:    uaddw v2.4s, v7.4s, v2.4h
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
   %p2 = getelementptr i8, ptr %p, i32 4
@@ -860,36 +861,36 @@ define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_ext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s1, s2, [x2]
-; CHECK-NEXT:    add x10, x3, #12
-; CHECK-NEXT:    ldp s3, s5, [x0]
-; CHECK-NEXT:    add x11, x1, #12
-; CHECK-NEXT:    ldp s6, s0, [x2, #8]
+; CHECK-NEXT:    ldp s0, s3, [x2]
 ; CHECK-NEXT:    add x8, x3, #8
+; CHECK-NEXT:    add x9, x3, #12
+; CHECK-NEXT:    add x10, x1, #8
+; CHECK-NEXT:    add x11, x1, #12
+; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
+; CHECK-NEXT:    ldp s1, s2, [x0]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v2.s }[1], [x1]
 ; CHECK-NEXT:    ldp s7, s4, [x0, #8]
-; CHECK-NEXT:    add x9, x1, #8
-; CHECK-NEXT:    ld1 { v1.s }[1], [x3], #4
-; CHECK-NEXT:    ld1 { v3.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v5.s }[1], [x1]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
+; CHECK-NEXT:    ldp s6, s5, [x2, #8]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v0.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x9]
-; CHECK-NEXT:    uaddl v5.8h, v5.8b, v4.8b
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v0.8b
-; CHECK-NEXT:    ushll v16.8h, v0.8b, #0
-; CHECK-NEXT:    uaddl v3.8h, v3.8b, v7.8b
-; CHECK-NEXT:    uaddl v6.8h, v1.8b, v6.8b
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v4.8b
+; CHECK-NEXT:    uaddl v1.8h, v1.8b, v7.8b
 ; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
-; CHECK-NEXT:    ushll v1.4s, v5.4h, #3
-; CHECK-NEXT:    ushll v7.4s, v2.4h, #3
+; CHECK-NEXT:    uaddl v3.8h, v3.8b, v5.8b
+; CHECK-NEXT:    uaddl v6.8h, v0.8b, v6.8b
+; CHECK-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-NEXT:    ushll v0.4s, v2.4h, #3
 ; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #3
-; CHECK-NEXT:    ushll2 v5.4s, v5.8h, #3
-; CHECK-NEXT:    stp q4, q16, [x4]
-; CHECK-NEXT:    uaddw v0.4s, v1.4s, v3.4h
-; CHECK-NEXT:    uaddw2 v1.4s, v5.4s, v3.8h
-; CHECK-NEXT:    uaddw2 v3.4s, v2.4s, v6.8h
+; CHECK-NEXT:    ushll v7.4s, v3.4h, #3
+; CHECK-NEXT:    ushll2 v3.4s, v3.8h, #3
+; CHECK-NEXT:    stp q4, q5, [x4]
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v6.8h
 ; CHECK-NEXT:    uaddw v2.4s, v7.4s, v6.4h
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
@@ -959,35 +960,35 @@ define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_add:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s0, s1, [x0]
-; CHECK-NEXT:    add x10, x3, #12
-; CHECK-NEXT:    ldp s2, s3, [x2]
-; CHECK-NEXT:    add x11, x1, #12
-; CHECK-NEXT:    ldp s4, s5, [x0, #8]
+; CHECK-NEXT:    ldp s0, s4, [x2]
 ; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    ldp s6, s7, [x2, #8]
-; CHECK-NEXT:    add x9, x1, #8
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
-; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
+; CHECK-NEXT:    add x9, x3, #12
+; CHECK-NEXT:    add x10, x1, #8
+; CHECK-NEXT:    add x11, x1, #12
+; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
+; CHECK-NEXT:    ldp s1, s2, [x0]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v2.s }[1], [x1]
+; CHECK-NEXT:    ldp s7, s3, [x0, #8]
+; CHECK-NEXT:    ld1 { v4.s }[1], [x3]
+; CHECK-NEXT:    ldp s6, s5, [x2, #8]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x11]
 ; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
-; CHECK-NEXT:    uaddl v5.8h, v1.8b, v5.8b
-; CHECK-NEXT:    uaddl v7.8h, v3.8b, v7.8b
-; CHECK-NEXT:    uaddl v1.8h, v0.8b, v4.8b
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    ushll v0.4s, v5.4h, #3
-; CHECK-NEXT:    ushll v4.4s, v7.4h, #3
-; CHECK-NEXT:    ushll2 v3.4s, v7.8h, #3
-; CHECK-NEXT:    ushll2 v6.4s, v5.8h, #3
-; CHECK-NEXT:    stp q5, q7, [x4]
+; CHECK-NEXT:    uaddl v16.8h, v2.8b, v3.8b
+; CHECK-NEXT:    uaddl v1.8h, v1.8b, v7.8b
+; CHECK-NEXT:    uaddl v4.8h, v4.8b, v5.8b
+; CHECK-NEXT:    uaddl v2.8h, v0.8b, v6.8b
+; CHECK-NEXT:    ushll v0.4s, v16.4h, #3
+; CHECK-NEXT:    ushll2 v6.4s, v16.8h, #3
+; CHECK-NEXT:    ushll v5.4s, v4.4h, #3
+; CHECK-NEXT:    ushll2 v3.4s, v4.8h, #3
+; CHECK-NEXT:    stp q16, q4, [x4]
 ; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v2.8h
-; CHECK-NEXT:    uaddw v2.4s, v4.4s, v2.4h
 ; CHECK-NEXT:    uaddw2 v1.4s, v6.4s, v1.8h
+; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v2.8h
+; CHECK-NEXT:    uaddw v2.4s, v5.4s, v2.4h
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
   %p2 = getelementptr i8, ptr %p, i32 4
@@ -1056,38 +1057,38 @@ define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_ext2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s0, s1, [x2]
-; CHECK-NEXT:    add x10, x3, #12
-; CHECK-NEXT:    ldp s2, s3, [x0]
-; CHECK-NEXT:    add x11, x1, #12
-; CHECK-NEXT:    ldp s4, s5, [x2, #8]
+; CHECK-NEXT:    ldp s0, s4, [x2]
 ; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    ldp s6, s7, [x0, #8]
-; CHECK-NEXT:    add x9, x1, #8
+; CHECK-NEXT:    add x9, x3, #12
+; CHECK-NEXT:    add x10, x1, #8
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
-; CHECK-NEXT:    ld1 { v2.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v3.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x8]
-; CHECK-NEXT:    uaddl v7.8h, v3.8b, v7.8b
-; CHECK-NEXT:    uaddl v1.8h, v1.8b, v5.8b
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    uaddl v4.8h, v0.8b, v4.8b
-; CHECK-NEXT:    ushll v0.4s, v7.4h, #3
-; CHECK-NEXT:    ushll2 v3.4s, v7.8h, #3
-; CHECK-NEXT:    ushll v6.4s, v1.4h, #3
-; CHECK-NEXT:    ushll2 v16.4s, v1.8h, #3
-; CHECK-NEXT:    ushll2 v5.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v17.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v18.4s, v7.8h, #0
-; CHECK-NEXT:    uaddw2 v1.4s, v3.4s, v2.8h
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT:    ldp s1, s2, [x0]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v2.s }[1], [x1]
+; CHECK-NEXT:    ldp s6, s3, [x0, #8]
+; CHECK-NEXT:    ld1 { v4.s }[1], [x3]
+; CHECK-NEXT:    ldp s7, s5, [x2, #8]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v6.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x9]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x8]
+; CHECK-NEXT:    uaddl v16.8h, v2.8b, v3.8b
+; CHECK-NEXT:    uaddl v3.8h, v1.8b, v6.8b
+; CHECK-NEXT:    uaddl v2.8h, v4.8b, v5.8b
+; CHECK-NEXT:    uaddl v4.8h, v0.8b, v7.8b
+; CHECK-NEXT:    ushll v0.4s, v16.4h, #3
+; CHECK-NEXT:    ushll2 v1.4s, v16.8h, #3
+; CHECK-NEXT:    ushll2 v18.4s, v16.8h, #0
+; CHECK-NEXT:    ushll v6.4s, v2.4h, #3
+; CHECK-NEXT:    ushll2 v7.4s, v2.8h, #3
+; CHECK-NEXT:    ushll2 v5.4s, v2.8h, #0
+; CHECK-NEXT:    ushll v17.4s, v2.4h, #0
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v3.8h
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v3.4h
+; CHECK-NEXT:    uaddw2 v3.4s, v7.4s, v4.8h
 ; CHECK-NEXT:    uaddw v2.4s, v6.4s, v4.4h
-; CHECK-NEXT:    uaddw2 v3.4s, v16.4s, v4.8h
-; CHECK-NEXT:    ushll v4.4s, v7.4h, #0
+; CHECK-NEXT:    ushll v4.4s, v16.4h, #0
 ; CHECK-NEXT:    stp q17, q5, [x4, #32]
 ; CHECK-NEXT:    stp q4, q18, [x4]
 ; CHECK-NEXT:    ret
@@ -1158,36 +1159,36 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_shl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s1, s2, [x0]
-; CHECK-NEXT:    add x10, x3, #12
-; CHECK-NEXT:    ldp s0, s3, [x2]
-; CHECK-NEXT:    add x11, x1, #12
-; CHECK-NEXT:    ldp s4, s5, [x0, #8]
+; CHECK-NEXT:    ldp s0, s4, [x2]
 ; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    ldp s6, s7, [x2, #8]
-; CHECK-NEXT:    add x9, x1, #8
+; CHECK-NEXT:    add x9, x3, #12
+; CHECK-NEXT:    add x10, x1, #8
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
+; CHECK-NEXT:    ldp s1, s2, [x0]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1], #4
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x11]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v5.8b
-; CHECK-NEXT:    uaddl v3.8h, v3.8b, v7.8b
-; CHECK-NEXT:    uaddl v4.8h, v1.8b, v4.8b
+; CHECK-NEXT:    ldp s6, s3, [x0, #8]
+; CHECK-NEXT:    ld1 { v4.s }[1], [x3]
+; CHECK-NEXT:    ldp s7, s5, [x2, #8]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v6.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x9]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x8]
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    uaddl v3.8h, v4.8b, v5.8b
+; CHECK-NEXT:    uaddl v4.8h, v1.8b, v6.8b
 ; CHECK-NEXT:    ushll v5.4s, v2.4h, #3
-; CHECK-NEXT:    ushll2 v7.4s, v2.8h, #3
-; CHECK-NEXT:    uaddl v2.8h, v0.8b, v6.8b
-; CHECK-NEXT:    ushll v6.4s, v3.4h, #3
+; CHECK-NEXT:    ushll2 v6.4s, v2.8h, #3
+; CHECK-NEXT:    uaddl v2.8h, v0.8b, v7.8b
+; CHECK-NEXT:    ushll v7.4s, v3.4h, #3
 ; CHECK-NEXT:    ushll2 v16.4s, v3.8h, #3
-; CHECK-NEXT:    uaddw2 v1.4s, v7.4s, v4.8h
+; CHECK-NEXT:    uaddw2 v1.4s, v6.4s, v4.8h
 ; CHECK-NEXT:    uaddw v0.4s, v5.4s, v4.4h
-; CHECK-NEXT:    stp q5, q7, [x4]
+; CHECK-NEXT:    stp q5, q6, [x4]
 ; CHECK-NEXT:    uaddw2 v3.4s, v16.4s, v2.8h
-; CHECK-NEXT:    uaddw v2.4s, v6.4s, v2.4h
-; CHECK-NEXT:    stp q6, q16, [x4, #32]
+; CHECK-NEXT:    uaddw v2.4s, v7.4s, v2.4h
+; CHECK-NEXT:    stp q7, q16, [x4, #32]
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
   %p2 = getelementptr i8, ptr %p, i32 4

diff  --git a/llvm/test/CodeGen/AArch64/load-insert-undef.ll b/llvm/test/CodeGen/AArch64/load-insert-undef.ll
index 1e776d1c06fcb3..b1b1289a0e53f8 100644
--- a/llvm/test/CodeGen/AArch64/load-insert-undef.ll
+++ b/llvm/test/CodeGen/AArch64/load-insert-undef.ll
@@ -170,8 +170,7 @@ define <16 x i8> @loadv16i8_offset(ptr %p) {
 define <4 x i16> @loadv4i16_offset(ptr %p) {
 ; CHECK-LABEL: loadv4i16_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldurh w8, [x0, #1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur h0, [x0, #1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
   %l = load i16, ptr %g
@@ -182,8 +181,7 @@ define <4 x i16> @loadv4i16_offset(ptr %p) {
 define <8 x i16> @loadv8i16_offset(ptr %p) {
 ; CHECK-LABEL: loadv8i16_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldurh w8, [x0, #1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur h0, [x0, #1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
   %l = load i16, ptr %g
@@ -194,8 +192,7 @@ define <8 x i16> @loadv8i16_offset(ptr %p) {
 define <2 x i32> @loadv2i32_offset(ptr %p) {
 ; CHECK-LABEL: loadv2i32_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur w8, [x0, #1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur s0, [x0, #1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
   %l = load i32, ptr %g
@@ -206,8 +203,7 @@ define <2 x i32> @loadv2i32_offset(ptr %p) {
 define <4 x i32> @loadv4i32_offset(ptr %p) {
 ; CHECK-LABEL: loadv4i32_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur w8, [x0, #1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur s0, [x0, #1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
   %l = load i32, ptr %g
@@ -218,8 +214,7 @@ define <4 x i32> @loadv4i32_offset(ptr %p) {
 define <2 x i64> @loadv2i64_offset(ptr %p) {
 ; CHECK-LABEL: loadv2i64_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur x8, [x0, #1]
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldur d0, [x0, #1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
   %l = load i64, ptr %g
@@ -309,8 +304,7 @@ define <2 x double> @loadv2f64_offset(ptr %p) {
 define <8 x i8> @loadv8i8_noffset(ptr %p) {
 ; CHECK-LABEL: loadv8i8_noffset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldurb w8, [x0, #-1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur b0, [x0, #-1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 -1
   %l = load i8, ptr %g
@@ -321,8 +315,7 @@ define <8 x i8> @loadv8i8_noffset(ptr %p) {
 define <16 x i8> @loadv16i8_noffset(ptr %p) {
 ; CHECK-LABEL: loadv16i8_noffset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldurb w8, [x0, #-1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur b0, [x0, #-1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 -1
   %l = load i8, ptr %g
@@ -333,8 +326,7 @@ define <16 x i8> @loadv16i8_noffset(ptr %p) {
 define <4 x i16> @loadv4i16_noffset(ptr %p) {
 ; CHECK-LABEL: loadv4i16_noffset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldurh w8, [x0, #-1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur h0, [x0, #-1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 -1
   %l = load i16, ptr %g
@@ -345,8 +337,7 @@ define <4 x i16> @loadv4i16_noffset(ptr %p) {
 define <8 x i16> @loadv8i16_noffset(ptr %p) {
 ; CHECK-LABEL: loadv8i16_noffset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldurh w8, [x0, #-1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur h0, [x0, #-1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 -1
   %l = load i16, ptr %g
@@ -357,8 +348,7 @@ define <8 x i16> @loadv8i16_noffset(ptr %p) {
 define <2 x i32> @loadv2i32_noffset(ptr %p) {
 ; CHECK-LABEL: loadv2i32_noffset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur w8, [x0, #-1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur s0, [x0, #-1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 -1
   %l = load i32, ptr %g
@@ -369,8 +359,7 @@ define <2 x i32> @loadv2i32_noffset(ptr %p) {
 define <4 x i32> @loadv4i32_noffset(ptr %p) {
 ; CHECK-LABEL: loadv4i32_noffset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur w8, [x0, #-1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur s0, [x0, #-1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 -1
   %l = load i32, ptr %g
@@ -381,8 +370,7 @@ define <4 x i32> @loadv4i32_noffset(ptr %p) {
 define <2 x i64> @loadv2i64_noffset(ptr %p) {
 ; CHECK-LABEL: loadv2i64_noffset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur x8, [x0, #-1]
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldur d0, [x0, #-1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 -1
   %l = load i64, ptr %g
@@ -798,8 +786,7 @@ define <vscale x 8 x i8> @loadnxv8i8(ptr %p) {
 define <vscale x 16 x i8> @loadnxv16i8(ptr %p) {
 ; CHECK-LABEL: loadnxv16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldr b0, [x0]
 ; CHECK-NEXT:    ret
   %l = load i8, ptr %p
   %v = insertelement <vscale x 16 x i8> poison, i8 %l, i32 0
@@ -820,8 +807,7 @@ define <vscale x 4 x i16> @loadnxv4i16(ptr %p) {
 define <vscale x 8 x i16> @loadnxv8i16(ptr %p) {
 ; CHECK-LABEL: loadnxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldr h0, [x0]
 ; CHECK-NEXT:    ret
   %l = load i16, ptr %p
   %v = insertelement <vscale x 8 x i16> poison, i16 %l, i32 0
@@ -842,8 +828,7 @@ define <vscale x 2 x i32> @loadnxv2i32(ptr %p) {
 define <vscale x 4 x i32> @loadnxv4i32(ptr %p) {
 ; CHECK-LABEL: loadnxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    ret
   %l = load i32, ptr %p
   %v = insertelement <vscale x 4 x i32> poison, i32 %l, i32 0
@@ -853,8 +838,7 @@ define <vscale x 4 x i32> @loadnxv4i32(ptr %p) {
 define <vscale x 2 x i64> @loadnxv2i64(ptr %p) {
 ; CHECK-LABEL: loadnxv2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldr d0, [x0]
 ; CHECK-NEXT:    ret
   %l = load i64, ptr %p
   %v = insertelement <vscale x 2 x i64> poison, i64 %l, i32 0
@@ -950,8 +934,7 @@ define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) {
 define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv16i8_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldr b0, [x0, #1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
   %l = load i8, ptr %g
@@ -974,8 +957,7 @@ define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) {
 define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv8i16_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldurh w8, [x0, #1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur h0, [x0, #1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
   %l = load i16, ptr %g
@@ -998,8 +980,7 @@ define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) {
 define <vscale x 4 x i32> @loadnxv4i32_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv4i32_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur w8, [x0, #1]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldur s0, [x0, #1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
   %l = load i32, ptr %g
@@ -1010,8 +991,7 @@ define <vscale x 4 x i32> @loadnxv4i32_offset(ptr %p) {
 define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) {
 ; CHECK-LABEL: loadnxv2i64_offset:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldur x8, [x0, #1]
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    ldur d0, [x0, #1]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i8, ptr %p, i64 1
   %l = load i64, ptr %g

diff  --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll
index ccbd6f03fbcc36..d66944e646dabb 100644
--- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll
+++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll
@@ -505,8 +505,7 @@ define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) {
 define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) {
 ; CHECK-LABEL: loadv2i32_roW:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, w1, sxtw #2
-; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ldr s0, [x0, w1, sxtw #2]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i32, ptr %p, i32 %o
   %l = load i32, ptr %g
@@ -517,8 +516,7 @@ define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) {
 define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) {
 ; CHECK-LABEL: loadv4i32_roW:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, w1, sxtw #2
-; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ldr s0, [x0, w1, sxtw #2]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i32, ptr %p, i32 %o
   %l = load i32, ptr %g
@@ -584,8 +582,7 @@ define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) {
 define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) {
 ; CHECK-LABEL: loadv2f32_roW:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, w1, sxtw #2
-; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ldr s0, [x0, w1, sxtw #2]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds float, ptr %p, i32 %o
   %l = load float, ptr %g
@@ -596,8 +593,7 @@ define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) {
 define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) {
 ; CHECK-LABEL: loadv4f32_roW:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, w1, sxtw #2
-; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ldr s0, [x0, w1, sxtw #2]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds float, ptr %p, i32 %o
   %l = load float, ptr %g
@@ -666,8 +662,7 @@ define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) {
 define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) {
 ; CHECK-LABEL: loadv2i32_roX:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x1, lsl #2
-; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ldr s0, [x0, x1, lsl #2]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i32, ptr %p, i64 %o
   %l = load i32, ptr %g
@@ -678,8 +673,7 @@ define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) {
 define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) {
 ; CHECK-LABEL: loadv4i32_roX:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x1, lsl #2
-; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ldr s0, [x0, x1, lsl #2]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds i32, ptr %p, i64 %o
   %l = load i32, ptr %g
@@ -745,8 +739,7 @@ define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) {
 define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) {
 ; CHECK-LABEL: loadv2f32_roX:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x1, lsl #2
-; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ldr s0, [x0, x1, lsl #2]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds float, ptr %p, i64 %o
   %l = load float, ptr %g
@@ -757,8 +750,7 @@ define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) {
 define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) {
 ; CHECK-LABEL: loadv4f32_roX:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x1, lsl #2
-; CHECK-NEXT:    ldr s0, [x8]
+; CHECK-NEXT:    ldr s0, [x0, x1, lsl #2]
 ; CHECK-NEXT:    ret
   %g = getelementptr inbounds float, ptr %p, i64 %o
   %l = load float, ptr %g

diff  --git a/llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll b/llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll
index 23011dfc2d4fd5..a0eba7c747e954 100644
--- a/llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-scoped-aa-store.ll
@@ -15,7 +15,7 @@
 define void @blam0(ptr %g0, ptr %g1) {
 ; MIR-LABEL: name: blam0
 ; MIR: LDRDui %0, 0 :: (load (s64) from %ir.g0, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR: STRDui killed %5, %1, 0 :: (store (s64) into %ir.tmp41, align 4, !alias.scope ![[SET1]], !noalias ![[SET0]])
+; MIR: STRDui killed %{{[0-9]*}}, %1, 0 :: (store (s64) into %ir.tmp41, align 4, !alias.scope ![[SET1]], !noalias ![[SET0]])
   %tmp4 = getelementptr inbounds <3 x float>, ptr %g1, i64 0, i64 0
   %tmp5 = load <3 x float>, ptr %g0, align 4, !alias.scope !0, !noalias !1
   %tmp6 = extractelement <3 x float> %tmp5, i64 0
@@ -35,7 +35,7 @@ define void @blam1(ptr %g0, ptr %g1) {
 ; MIR-DAG: ![[MMSET1:[0-9]+]] = !{}
 ; MIR: body:
 ; MIR: LDRDui %0, 0 :: (load (s64) from %ir.g0, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR: STRDui killed %5, %1, 0 :: (store (s64) into %ir.tmp41, align 4, !alias.scope ![[MMSET0]], !noalias ![[MMSET1]])
+; MIR: STRDui killed %{{[0-9]*}}, %1, 0 :: (store (s64) into %ir.tmp41, align 4, !alias.scope ![[MMSET0]], !noalias ![[MMSET1]])
   %tmp4 = getelementptr inbounds <3 x float>, ptr %g1, i64 0, i64 0
   %tmp5 = load <3 x float>, ptr %g0, align 4, !alias.scope !0, !noalias !1
   %tmp6 = extractelement <3 x float> %tmp5, i64 0

diff  --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 33245a2b120ea4..a446aae21ee8fb 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1089,209 +1089,209 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25
 ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    ldr b0, [sp, #280]
-; CHECK-NEXT:    add x8, sp, #288
-; CHECK-NEXT:    ldr b1, [sp, #80]
-; CHECK-NEXT:    ldr b2, [sp, #152]
-; CHECK-NEXT:    add x9, sp, #296
-; CHECK-NEXT:    ldr b4, [sp, #216]
+; CHECK-NEXT:    ldr b0, [sp, #216]
+; CHECK-NEXT:    add x8, sp, #224
+; CHECK-NEXT:    ldr b1, [sp, #16]
+; CHECK-NEXT:    ldr b2, [sp, #280]
+; CHECK-NEXT:    add x9, sp, #240
+; CHECK-NEXT:    ldr b4, [sp, #80]
 ; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #88
-; CHECK-NEXT:    add x10, sp, #320
+; CHECK-NEXT:    add x8, sp, #24
+; CHECK-NEXT:    add x10, sp, #48
 ; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #160
-; CHECK-NEXT:    add x12, sp, #192
-; CHECK-NEXT:    ld1 { v2.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #304
-; CHECK-NEXT:    add x11, sp, #328
-; CHECK-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #96
-; CHECK-NEXT:    ldr b5, [sp, #16]
-; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #168
-; CHECK-NEXT:    ldr b6, [sp, #680]
-; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #104
-; CHECK-NEXT:    ldr b7, [sp, #480]
-; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #312
+; CHECK-NEXT:    add x8, sp, #232
+; CHECK-NEXT:    add x11, sp, #96
+; CHECK-NEXT:    ldr b5, [sp, #152]
+; CHECK-NEXT:    add x12, sp, #168
+; CHECK-NEXT:    ldr b6, [sp, #616]
+; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #32
 ; CHECK-NEXT:    fmov s3, w0
-; CHECK-NEXT:    ld1 { v1.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #176
-; CHECK-NEXT:    ldr b19, [sp, #552]
-; CHECK-NEXT:    ld1 { v2.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #112
+; CHECK-NEXT:    ld1 { v1.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #288
+; CHECK-NEXT:    ldr b7, [sp, #416]
+; CHECK-NEXT:    ld1 { v2.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #40
 ; CHECK-NEXT:    ldr b22, [sp, #744]
-; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #336
+; CHECK-NEXT:    ld1 { v0.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #248
 ; CHECK-NEXT:    mov v3.b[1], w1
-; CHECK-NEXT:    ld1 { v1.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #184
+; CHECK-NEXT:    ld1 { v1.b }[3], [x8]
+; CHECK-NEXT:    add x8, sp, #88
 ; CHECK-NEXT:    ldr b23, [sp, #544]
-; CHECK-NEXT:    ld1 { v2.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #224
-; CHECK-NEXT:    ldr b20, [sp, #352]
-; CHECK-NEXT:    ld1 { v0.b }[5], [x10]
-; CHECK-NEXT:    ld1 { v4.b }[1], [x9]
-; CHECK-NEXT:    add x10, sp, #120
-; CHECK-NEXT:    ld1 { v1.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #128
-; CHECK-NEXT:    add x9, sp, #136
-; CHECK-NEXT:    ld1 { v2.b }[5], [x12]
-; CHECK-NEXT:    add x12, sp, #232
+; CHECK-NEXT:    ld1 { v4.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #256
+; CHECK-NEXT:    ldr b19, [sp, #680]
+; CHECK-NEXT:    ld1 { v0.b }[4], [x9]
+; CHECK-NEXT:    add x9, sp, #296
+; CHECK-NEXT:    ldr b20, [sp, #480]
+; CHECK-NEXT:    ld1 { v1.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
+; CHECK-NEXT:    add x10, sp, #160
+; CHECK-NEXT:    ld1 { v4.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #304
+; CHECK-NEXT:    ld1 { v5.b }[1], [x10]
+; CHECK-NEXT:    ld1 { v0.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #56
+; CHECK-NEXT:    add x10, sp, #264
+; CHECK-NEXT:    ld1 { v1.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #64
+; CHECK-NEXT:    ld1 { v2.b }[3], [x11]
+; CHECK-NEXT:    add x9, sp, #272
+; CHECK-NEXT:    ld1 { v5.b }[2], [x12]
+; CHECK-NEXT:    add x11, sp, #72
+; CHECK-NEXT:    ld1 { v0.b }[6], [x10]
+; CHECK-NEXT:    add x10, sp, #312
 ; CHECK-NEXT:    mov v3.b[2], w2
-; CHECK-NEXT:    ld1 { v0.b }[6], [x11]
-; CHECK-NEXT:    ld1 { v4.b }[2], [x12]
-; CHECK-NEXT:    add x11, sp, #240
-; CHECK-NEXT:    add x12, sp, #24
-; CHECK-NEXT:    ld1 { v1.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #200
-; CHECK-NEXT:    ld1 { v5.b }[1], [x12]
-; CHECK-NEXT:    ld1 { v2.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #256
-; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
-; CHECK-NEXT:    ld1 { v4.b }[3], [x11]
-; CHECK-NEXT:    add x8, sp, #688
-; CHECK-NEXT:    ld1 { v6.b }[1], [x8]
-; CHECK-NEXT:    add x11, sp, #32
-; CHECK-NEXT:    add x8, sp, #248
-; CHECK-NEXT:    ld1 { v5.b }[2], [x11]
-; CHECK-NEXT:    ld1 { v1.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #40
+; CHECK-NEXT:    ld1 { v1.b }[6], [x8]
+; CHECK-NEXT:    add x8, sp, #104
+; CHECK-NEXT:    ld1 { v2.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v4.b }[3], [x8]
+; CHECK-NEXT:    add x8, sp, #112
+; CHECK-NEXT:    add x10, sp, #128
+; CHECK-NEXT:    ld1 { v0.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #320
+; CHECK-NEXT:    ldr b21, [sp, #552]
+; CHECK-NEXT:    ld1 { v2.b }[5], [x9]
+; CHECK-NEXT:    add x9, sp, #176
+; CHECK-NEXT:    ld1 { v1.b }[7], [x11]
 ; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
-; CHECK-NEXT:    add x8, sp, #696
-; CHECK-NEXT:    ldr b21, [sp, #616]
-; CHECK-NEXT:    ld1 { v6.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #208
-; CHECK-NEXT:    smull v23.8h, v23.8b, v22.8b
+; CHECK-NEXT:    add x8, sp, #624
 ; CHECK-NEXT:    ld1 { v5.b }[3], [x9]
-; CHECK-NEXT:    ld1 { v2.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #704
-; CHECK-NEXT:    ld1 { v4.b }[5], [x10]
-; CHECK-NEXT:    add x9, sp, #48
-; CHECK-NEXT:    add x10, sp, #56
-; CHECK-NEXT:    ld1 { v6.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #264
-; CHECK-NEXT:    ldr b22, [sp, #416]
+; CHECK-NEXT:    ld1 { v6.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #120
+; CHECK-NEXT:    add x9, sp, #328
+; CHECK-NEXT:    ld1 { v2.b }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #184
+; CHECK-NEXT:    add x11, sp, #192
+; CHECK-NEXT:    ld1 { v4.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #632
 ; CHECK-NEXT:    ld1 { v5.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #488
+; CHECK-NEXT:    ld1 { v6.b }[2], [x8]
+; CHECK-NEXT:    add x9, sp, #640
+; CHECK-NEXT:    add x8, sp, #336
+; CHECK-NEXT:    ld1 { v2.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #656
+; CHECK-NEXT:    smull v23.8h, v23.8b, v22.8b
+; CHECK-NEXT:    ld1 { v5.b }[5], [x11]
+; CHECK-NEXT:    add x11, sp, #648
+; CHECK-NEXT:    ld1 { v4.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v6.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #200
+; CHECK-NEXT:    add x10, sp, #136
+; CHECK-NEXT:    ldr b22, [sp, #352]
+; CHECK-NEXT:    add x12, sp, #360
 ; CHECK-NEXT:    mov v3.b[3], w3
-; CHECK-NEXT:    ld1 { v4.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #712
-; CHECK-NEXT:    ld1 { v7.b }[1], [x9]
-; CHECK-NEXT:    ld1 { v6.b }[4], [x8]
-; CHECK-NEXT:    add x9, sp, #720
-; CHECK-NEXT:    add x8, sp, #64
-; CHECK-NEXT:    ld1 { v5.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #496
-; CHECK-NEXT:    add x11, sp, #576
-; CHECK-NEXT:    ld1 { v7.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #72
+; CHECK-NEXT:    ld1 { v5.b }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #208
+; CHECK-NEXT:    ld1 { v4.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v6.b }[4], [x11]
+; CHECK-NEXT:    add x11, sp, #424
+; CHECK-NEXT:    add x10, sp, #488
+; CHECK-NEXT:    ld1 { v7.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #560
+; CHECK-NEXT:    ld1 { v20.b }[1], [x10]
+; CHECK-NEXT:    ld1 { v5.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #440
+; CHECK-NEXT:    ld1 { v21.b }[1], [x11]
+; CHECK-NEXT:    ld1 { v6.b }[5], [x8]
+; CHECK-NEXT:    add x8, sp, #432
+; CHECK-NEXT:    ld1 { v22.b }[1], [x12]
+; CHECK-NEXT:    ld1 { v7.b }[2], [x8]
+; CHECK-NEXT:    add x11, sp, #496
+; CHECK-NEXT:    add x12, sp, #568
+; CHECK-NEXT:    add x13, sp, #368
+; CHECK-NEXT:    ld1 { v20.b }[2], [x11]
+; CHECK-NEXT:    ld1 { v21.b }[2], [x12]
+; CHECK-NEXT:    ld1 { v22.b }[2], [x13]
+; CHECK-NEXT:    add x10, sp, #448
 ; CHECK-NEXT:    mov v3.b[4], w4
-; CHECK-NEXT:    ld1 { v6.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #272
-; CHECK-NEXT:    ldr b16, [sp, #344]
-; CHECK-NEXT:    ld1 { v5.b }[6], [x8]
-; CHECK-NEXT:    add x8, sp, #728
-; CHECK-NEXT:    ld1 { v4.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #504
-; CHECK-NEXT:    ldr b17, [sp, #144]
-; CHECK-NEXT:    sshll v23.4s, v23.4h, #0
-; CHECK-NEXT:    ld1 { v6.b }[6], [x8]
 ; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
-; CHECK-NEXT:    add x8, sp, #736
-; CHECK-NEXT:    add x9, sp, #512
-; CHECK-NEXT:    ld1 { v5.b }[7], [x10]
-; CHECK-NEXT:    add x10, sp, #568
+; CHECK-NEXT:    add x9, sp, #688
+; CHECK-NEXT:    add x11, sp, #576
+; CHECK-NEXT:    ld1 { v19.b }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #696
+; CHECK-NEXT:    add x12, sp, #376
+; CHECK-NEXT:    ld1 { v21.b }[3], [x11]
+; CHECK-NEXT:    ld1 { v22.b }[3], [x12]
+; CHECK-NEXT:    add x11, sp, #512
+; CHECK-NEXT:    ld1 { v7.b }[4], [x10]
+; CHECK-NEXT:    add x10, sp, #504
+; CHECK-NEXT:    add x12, sp, #584
+; CHECK-NEXT:    ld1 { v19.b }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #704
+; CHECK-NEXT:    ld1 { v20.b }[3], [x10]
+; CHECK-NEXT:    add x13, sp, #384
 ; CHECK-NEXT:    mov v3.b[5], w5
-; CHECK-NEXT:    smull v16.8h, v17.8b, v16.8b
-; CHECK-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v6.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #560
-; CHECK-NEXT:    ld1 { v7.b }[4], [x9]
-; CHECK-NEXT:    ld1 { v19.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #360
-; CHECK-NEXT:    add x9, sp, #424
-; CHECK-NEXT:    ld1 { v20.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #624
-; CHECK-NEXT:    ld1 { v22.b }[1], [x9]
-; CHECK-NEXT:    ld1 { v21.b }[1], [x8]
-; CHECK-NEXT:    add x9, sp, #368
-; CHECK-NEXT:    add x8, sp, #520
-; CHECK-NEXT:    ld1 { v19.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #432
-; CHECK-NEXT:    ld1 { v7.b }[5], [x8]
-; CHECK-NEXT:    ld1 { v20.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #632
-; CHECK-NEXT:    ld1 { v22.b }[2], [x10]
-; CHECK-NEXT:    ld1 { v21.b }[2], [x9]
-; CHECK-NEXT:    add x8, sp, #376
-; CHECK-NEXT:    add x9, sp, #440
-; CHECK-NEXT:    ld1 { v19.b }[3], [x11]
-; CHECK-NEXT:    add x10, sp, #584
+; CHECK-NEXT:    ld1 { v21.b }[4], [x12]
+; CHECK-NEXT:    ld1 { v22.b }[4], [x13]
+; CHECK-NEXT:    add x10, sp, #456
+; CHECK-NEXT:    ldr b16, [sp, #344]
+; CHECK-NEXT:    ld1 { v19.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #712
+; CHECK-NEXT:    ld1 { v20.b }[4], [x11]
+; CHECK-NEXT:    ldr b17, [sp, #144]
+; CHECK-NEXT:    ld1 { v7.b }[5], [x10]
+; CHECK-NEXT:    add x10, sp, #520
 ; CHECK-NEXT:    add x11, sp, #592
-; CHECK-NEXT:    ld1 { v20.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #640
-; CHECK-NEXT:    ld1 { v22.b }[3], [x9]
-; CHECK-NEXT:    ld1 { v21.b }[3], [x8]
-; CHECK-NEXT:    add x9, sp, #384
-; CHECK-NEXT:    add x8, sp, #528
-; CHECK-NEXT:    ld1 { v19.b }[4], [x10]
-; CHECK-NEXT:    add x10, sp, #448
-; CHECK-NEXT:    ld1 { v7.b }[6], [x8]
-; CHECK-NEXT:    ld1 { v20.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #648
-; CHECK-NEXT:    ld1 { v22.b }[4], [x10]
-; CHECK-NEXT:    ld1 { v21.b }[4], [x9]
-; CHECK-NEXT:    add x8, sp, #392
-; CHECK-NEXT:    add x9, sp, #456
-; CHECK-NEXT:    ld1 { v19.b }[5], [x11]
+; CHECK-NEXT:    add x12, sp, #392
 ; CHECK-NEXT:    mov v3.b[6], w6
-; CHECK-NEXT:    add x10, sp, #600
-; CHECK-NEXT:    ld1 { v20.b }[5], [x8]
-; CHECK-NEXT:    add x8, sp, #656
-; CHECK-NEXT:    ld1 { v22.b }[5], [x9]
-; CHECK-NEXT:    ld1 { v21.b }[5], [x8]
-; CHECK-NEXT:    add x9, sp, #400
-; CHECK-NEXT:    add x8, sp, #536
-; CHECK-NEXT:    ld1 { v19.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v19.b }[4], [x9]
+; CHECK-NEXT:    add x9, sp, #720
+; CHECK-NEXT:    ld1 { v20.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v21.b }[5], [x11]
+; CHECK-NEXT:    ld1 { v22.b }[5], [x12]
+; CHECK-NEXT:    smull v16.8h, v17.8b, v16.8b
+; CHECK-NEXT:    add x8, sp, #664
 ; CHECK-NEXT:    add x10, sp, #464
-; CHECK-NEXT:    ld1 { v7.b }[7], [x8]
-; CHECK-NEXT:    ld1 { v20.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #664
-; CHECK-NEXT:    ld1 { v22.b }[6], [x10]
-; CHECK-NEXT:    ld1 { v21.b }[6], [x9]
-; CHECK-NEXT:    add x8, sp, #408
+; CHECK-NEXT:    add x11, sp, #528
+; CHECK-NEXT:    ld1 { v19.b }[5], [x9]
+; CHECK-NEXT:    add x9, sp, #728
+; CHECK-NEXT:    add x12, sp, #600
+; CHECK-NEXT:    add x13, sp, #400
+; CHECK-NEXT:    ld1 { v6.b }[6], [x8]
+; CHECK-NEXT:    ld1 { v20.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v21.b }[6], [x12]
+; CHECK-NEXT:    ld1 { v22.b }[6], [x13]
+; CHECK-NEXT:    ld1 { v7.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v19.b }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #736
 ; CHECK-NEXT:    mov v3.b[7], w7
 ; CHECK-NEXT:    sshll v18.4s, v16.4h, #0
 ; CHECK-NEXT:    movi v16.2d, #0000000000000000
-; CHECK-NEXT:    add x11, sp, #608
-; CHECK-NEXT:    ld1 { v20.b }[7], [x8]
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
 ; CHECK-NEXT:    add x8, sp, #672
-; CHECK-NEXT:    add x9, sp, #472
-; CHECK-NEXT:    ld1 { v19.b }[7], [x11]
-; CHECK-NEXT:    ld1 { v21.b }[7], [x8]
-; CHECK-NEXT:    ld1 { v22.b }[7], [x9]
+; CHECK-NEXT:    add x10, sp, #472
+; CHECK-NEXT:    add x11, sp, #608
+; CHECK-NEXT:    ld1 { v19.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #536
+; CHECK-NEXT:    add x12, sp, #408
+; CHECK-NEXT:    ld1 { v20.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v21.b }[7], [x11]
+; CHECK-NEXT:    ld1 { v22.b }[7], [x12]
+; CHECK-NEXT:    ld1 { v6.b }[7], [x8]
+; CHECK-NEXT:    ld1 { v7.b }[7], [x10]
+; CHECK-NEXT:    sshll v23.4s, v23.4h, #0
 ; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT:    smull v1.8h, v3.8b, v2.8b
-; CHECK-NEXT:    smull v2.8h, v5.8b, v4.8b
+; CHECK-NEXT:    smull v1.8h, v4.8b, v2.8b
+; CHECK-NEXT:    smull v2.8h, v3.8b, v5.8b
+; CHECK-NEXT:    smull v3.8h, v20.8b, v19.8b
+; CHECK-NEXT:    smull v4.8h, v22.8b, v21.8b
 ; CHECK-NEXT:    mov v17.s[0], v18.s[0]
-; CHECK-NEXT:    smull v3.8h, v7.8b, v6.8b
+; CHECK-NEXT:    smull v5.8h, v7.8b, v6.8b
 ; CHECK-NEXT:    mov v16.s[0], v23.s[0]
-; CHECK-NEXT:    smull v4.8h, v20.8b, v19.8b
-; CHECK-NEXT:    smull v5.8h, v22.8b, v21.8b
-; CHECK-NEXT:    saddl v7.4s, v1.4h, v0.4h
-; CHECK-NEXT:    saddl2 v0.4s, v1.8h, v0.8h
-; CHECK-NEXT:    saddw v6.4s, v17.4s, v2.4h
-; CHECK-NEXT:    saddl v1.4s, v4.4h, v3.4h
-; CHECK-NEXT:    saddl2 v3.4s, v4.8h, v3.8h
-; CHECK-NEXT:    saddw v4.4s, v16.4s, v5.4h
-; CHECK-NEXT:    saddw2 v0.4s, v0.4s, v2.8h
-; CHECK-NEXT:    add v6.4s, v7.4s, v6.4s
+; CHECK-NEXT:    saddl2 v6.4s, v2.8h, v1.8h
+; CHECK-NEXT:    saddl v1.4s, v2.4h, v1.4h
+; CHECK-NEXT:    saddl2 v2.4s, v4.8h, v3.8h
+; CHECK-NEXT:    saddl v3.4s, v4.4h, v3.4h
+; CHECK-NEXT:    saddw v4.4s, v17.4s, v0.4h
+; CHECK-NEXT:    saddw v7.4s, v16.4s, v5.4h
+; CHECK-NEXT:    saddw2 v0.4s, v6.4s, v0.8h
 ; CHECK-NEXT:    add v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    saddw2 v2.4s, v3.4s, v5.8h
-; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    saddw2 v2.4s, v2.4s, v5.8h
+; CHECK-NEXT:    add v3.4s, v3.4s, v7.4s
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    add v1.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w0, s0
@@ -1558,22 +1558,22 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr b0, [x0, #32]
 ; CHECK-NEXT:    ldr b1, [x1, #32]
-; CHECK-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-NEXT:    ldp q4, q2, [x1]
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    ldp q3, q4, [x1]
 ; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT:    ldp q3, q1, [x0]
-; CHECK-NEXT:    umull v6.8h, v2.8b, v1.8b
-; CHECK-NEXT:    umull2 v1.8h, v2.16b, v1.16b
-; CHECK-NEXT:    umull v2.8h, v4.8b, v3.8b
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    umull v5.8h, v4.8b, v2.8b
+; CHECK-NEXT:    umull v6.8h, v3.8b, v1.8b
+; CHECK-NEXT:    umull2 v2.8h, v4.16b, v2.16b
 ; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-NEXT:    umull2 v3.8h, v4.16b, v3.16b
-; CHECK-NEXT:    mov v5.s[0], v0.s[0]
-; CHECK-NEXT:    uaddl2 v4.4s, v2.8h, v6.8h
-; CHECK-NEXT:    uaddl2 v0.4s, v3.8h, v1.8h
-; CHECK-NEXT:    uaddl v1.4s, v3.4h, v1.4h
-; CHECK-NEXT:    add v0.4s, v4.4s, v0.4s
-; CHECK-NEXT:    uaddw v2.4s, v5.4s, v2.4h
-; CHECK-NEXT:    uaddw v2.4s, v2.4s, v6.4h
+; CHECK-NEXT:    umull2 v1.8h, v3.16b, v1.16b
+; CHECK-NEXT:    mov v7.s[0], v0.s[0]
+; CHECK-NEXT:    uaddl2 v3.4s, v6.8h, v5.8h
+; CHECK-NEXT:    uaddl2 v0.4s, v1.8h, v2.8h
+; CHECK-NEXT:    uaddl v1.4s, v1.4h, v2.4h
+; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    uaddw v2.4s, v7.4s, v6.4h
+; CHECK-NEXT:    uaddw v2.4s, v2.4s, v5.4h
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
@@ -1626,22 +1626,22 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldr b0, [x0, #32]
 ; CHECK-NEXT:    ldr b1, [x1, #32]
-; CHECK-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-NEXT:    ldp q4, q2, [x1]
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    ldp q3, q4, [x1]
 ; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT:    ldp q3, q1, [x0]
-; CHECK-NEXT:    smull v6.8h, v2.8b, v1.8b
-; CHECK-NEXT:    smull2 v1.8h, v2.16b, v1.16b
-; CHECK-NEXT:    smull v2.8h, v4.8b, v3.8b
+; CHECK-NEXT:    ldp q1, q2, [x0]
+; CHECK-NEXT:    smull v5.8h, v4.8b, v2.8b
+; CHECK-NEXT:    smull v6.8h, v3.8b, v1.8b
+; CHECK-NEXT:    smull2 v2.8h, v4.16b, v2.16b
 ; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-NEXT:    smull2 v3.8h, v4.16b, v3.16b
-; CHECK-NEXT:    mov v5.s[0], v0.s[0]
-; CHECK-NEXT:    saddl2 v4.4s, v2.8h, v6.8h
-; CHECK-NEXT:    saddl2 v0.4s, v3.8h, v1.8h
-; CHECK-NEXT:    saddl v1.4s, v3.4h, v1.4h
-; CHECK-NEXT:    add v0.4s, v4.4s, v0.4s
-; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
-; CHECK-NEXT:    saddw v2.4s, v2.4s, v6.4h
+; CHECK-NEXT:    smull2 v1.8h, v3.16b, v1.16b
+; CHECK-NEXT:    mov v7.s[0], v0.s[0]
+; CHECK-NEXT:    saddl2 v3.4s, v6.8h, v5.8h
+; CHECK-NEXT:    saddl2 v0.4s, v1.8h, v2.8h
+; CHECK-NEXT:    saddl v1.4s, v1.4h, v2.4h
+; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    saddw v2.4s, v7.4s, v6.4h
+; CHECK-NEXT:    saddw v2.4s, v2.4s, v5.4h
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
@@ -1667,271 +1667,271 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ldr b0, [sp, #344]
 ; CHECK-NEXT:    add x8, sp, #352
-; CHECK-NEXT:    ldr b2, [sp, #80]
-; CHECK-NEXT:    add x9, sp, #88
-; CHECK-NEXT:    ldr b3, [sp, #216]
-; CHECK-NEXT:    add x10, sp, #232
+; CHECK-NEXT:    ldr b1, [sp, #80]
+; CHECK-NEXT:    ldr b2, [sp, #216]
+; CHECK-NEXT:    add x9, sp, #96
+; CHECK-NEXT:    add x10, sp, #104
 ; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #224
-; CHECK-NEXT:    ld1 { v2.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #360
-; CHECK-NEXT:    ld1 { v3.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #96
-; CHECK-NEXT:    add x11, sp, #376
+; CHECK-NEXT:    add x8, sp, #88
 ; CHECK-NEXT:    ldr b4, [sp, #408]
-; CHECK-NEXT:    add x12, sp, #384
-; CHECK-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #368
-; CHECK-NEXT:    ld1 { v2.b }[2], [x8]
-; CHECK-NEXT:    ld1 { v3.b }[2], [x10]
-; CHECK-NEXT:    add x8, sp, #104
-; CHECK-NEXT:    add x14, sp, #248
-; CHECK-NEXT:    add x10, sp, #392
+; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #360
+; CHECK-NEXT:    add x12, sp, #248
+; CHECK-NEXT:    add x13, sp, #432
+; CHECK-NEXT:    add x11, sp, #384
 ; CHECK-NEXT:    ldr b5, [sp, #144]
+; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #224
 ; CHECK-NEXT:    ldr b6, [sp, #280]
-; CHECK-NEXT:    ld1 { v0.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v2.b }[1], [x8]
+; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
+; CHECK-NEXT:    add x8, sp, #368
+; CHECK-NEXT:    add x9, sp, #232
+; CHECK-NEXT:    ldr b16, [sp, #744]
+; CHECK-NEXT:    ldr b17, [sp, #480]
+; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
+; CHECK-NEXT:    add x8, sp, #376
+; CHECK-NEXT:    ldr b18, [sp, #936]
+; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[3], [x10]
 ; CHECK-NEXT:    add x9, sp, #240
-; CHECK-NEXT:    ld1 { v2.b }[3], [x8]
-; CHECK-NEXT:    ld1 { v3.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #112
-; CHECK-NEXT:    add x8, sp, #400
-; CHECK-NEXT:    add x13, sp, #128
-; CHECK-NEXT:    ldr b17, [sp, #744]
-; CHECK-NEXT:    ldr b19, [sp, #480]
-; CHECK-NEXT:    ld1 { v0.b }[4], [x11]
-; CHECK-NEXT:    ld1 { v2.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #416
-; CHECK-NEXT:    ld1 { v4.b }[1], [x9]
-; CHECK-NEXT:    ld1 { v3.b }[4], [x14]
-; CHECK-NEXT:    add x11, sp, #120
-; CHECK-NEXT:    add x9, sp, #136
-; CHECK-NEXT:    ldr b21, [sp, #936]
-; CHECK-NEXT:    ldr b22, [sp, #672]
-; CHECK-NEXT:    ld1 { v0.b }[5], [x12]
-; CHECK-NEXT:    ld1 { v2.b }[5], [x11]
-; CHECK-NEXT:    add x11, sp, #424
-; CHECK-NEXT:    add x12, sp, #256
-; CHECK-NEXT:    ld1 { v4.b }[2], [x11]
-; CHECK-NEXT:    add x11, sp, #152
-; CHECK-NEXT:    ld1 { v3.b }[5], [x12]
-; CHECK-NEXT:    ld1 { v5.b }[1], [x11]
-; CHECK-NEXT:    add x11, sp, #432
-; CHECK-NEXT:    ld1 { v0.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #264
-; CHECK-NEXT:    ld1 { v2.b }[6], [x13]
-; CHECK-NEXT:    ld1 { v4.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #160
-; CHECK-NEXT:    ldr b7, [sp, #472]
-; CHECK-NEXT:    ld1 { v3.b }[6], [x10]
-; CHECK-NEXT:    ld1 { v5.b }[2], [x11]
-; CHECK-NEXT:    add x10, sp, #440
-; CHECK-NEXT:    ld1 { v0.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #288
-; CHECK-NEXT:    add x11, sp, #168
-; CHECK-NEXT:    ld1 { v6.b }[1], [x8]
-; CHECK-NEXT:    add x8, sp, #272
-; CHECK-NEXT:    ld1 { v4.b }[4], [x10]
-; CHECK-NEXT:    ld1 { v3.b }[7], [x8]
-; CHECK-NEXT:    add x8, sp, #296
-; CHECK-NEXT:    ld1 { v5.b }[3], [x11]
-; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
-; CHECK-NEXT:    add x9, sp, #448
+; CHECK-NEXT:    add x10, sp, #392
+; CHECK-NEXT:    ldr b19, [sp, #672]
+; CHECK-NEXT:    ldr b7, [sp, #16]
+; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
+; CHECK-NEXT:    add x8, sp, #112
+; CHECK-NEXT:    ldr b21, [sp, #1000]
+; CHECK-NEXT:    ld1 { v2.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v1.b }[4], [x8]
+; CHECK-NEXT:    add x8, sp, #416
+; CHECK-NEXT:    ld1 { v4.b }[1], [x8]
+; CHECK-NEXT:    add x8, sp, #120
+; CHECK-NEXT:    add x9, sp, #400
+; CHECK-NEXT:    ld1 { v0.b }[5], [x11]
+; CHECK-NEXT:    add x11, sp, #128
+; CHECK-NEXT:    ldr b22, [sp, #736]
+; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
+; CHECK-NEXT:    add x12, sp, #424
+; CHECK-NEXT:    ld1 { v1.b }[5], [x8]
+; CHECK-NEXT:    ld1 { v4.b }[2], [x12]
+; CHECK-NEXT:    add x12, sp, #152
+; CHECK-NEXT:    add x8, sp, #136
+; CHECK-NEXT:    ld1 { v5.b }[1], [x12]
+; CHECK-NEXT:    add x12, sp, #440
+; CHECK-NEXT:    ld1 { v0.b }[6], [x10]
+; CHECK-NEXT:    ld1 { v1.b }[6], [x11]
+; CHECK-NEXT:    add x11, sp, #288
+; CHECK-NEXT:    add x10, sp, #256
+; CHECK-NEXT:    ld1 { v4.b }[3], [x13]
+; CHECK-NEXT:    ld1 { v6.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #296
+; CHECK-NEXT:    ld1 { v0.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #160
+; CHECK-NEXT:    ld1 { v2.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v5.b }[2], [x9]
+; CHECK-NEXT:    add x10, sp, #168
+; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
+; CHECK-NEXT:    ld1 { v4.b }[4], [x12]
+; CHECK-NEXT:    add x12, sp, #448
+; CHECK-NEXT:    ld1 { v6.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #304
+; CHECK-NEXT:    add x8, sp, #464
+; CHECK-NEXT:    add x13, sp, #768
+; CHECK-NEXT:    ld1 { v5.b }[3], [x10]
 ; CHECK-NEXT:    add x10, sp, #176
-; CHECK-NEXT:    ld1 { v6.b }[2], [x8]
-; CHECK-NEXT:    ld1 { v4.b }[5], [x9]
-; CHECK-NEXT:    add x8, sp, #304
+; CHECK-NEXT:    add x9, sp, #264
+; CHECK-NEXT:    ld1 { v4.b }[5], [x12]
+; CHECK-NEXT:    add x12, sp, #456
+; CHECK-NEXT:    ld1 { v6.b }[3], [x11]
+; CHECK-NEXT:    add x11, sp, #760
+; CHECK-NEXT:    ld1 { v2.b }[6], [x9]
+; CHECK-NEXT:    add x9, sp, #272
 ; CHECK-NEXT:    ld1 { v5.b }[4], [x10]
-; CHECK-NEXT:    add x9, sp, #456
-; CHECK-NEXT:    add x10, sp, #184
-; CHECK-NEXT:    add x11, sp, #192
-; CHECK-NEXT:    ldr b16, [sp, #208]
-; CHECK-NEXT:    add x12, sp, #784
-; CHECK-NEXT:    ld1 { v6.b }[3], [x8]
-; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #312
-; CHECK-NEXT:    ld1 { v5.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #752
-; CHECK-NEXT:    smull v7.8h, v16.8b, v7.8b
-; CHECK-NEXT:    ld1 { v17.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #760
-; CHECK-NEXT:    ldr b16, [sp, #16]
-; CHECK-NEXT:    ld1 { v6.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #320
-; CHECK-NEXT:    ldr b18, [sp, #1000]
-; CHECK-NEXT:    ld1 { v5.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #768
-; CHECK-NEXT:    ldr b20, [sp, #736]
+; CHECK-NEXT:    add x10, sp, #312
+; CHECK-NEXT:    fmov s3, w0
+; CHECK-NEXT:    ld1 { v4.b }[6], [x12]
+; CHECK-NEXT:    ld1 { v6.b }[4], [x10]
+; CHECK-NEXT:    add x10, sp, #320
+; CHECK-NEXT:    add x12, sp, #680
+; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #184
+; CHECK-NEXT:    ld1 { v19.b }[1], [x12]
+; CHECK-NEXT:    add x12, sp, #776
+; CHECK-NEXT:    ld1 { v5.b }[5], [x9]
+; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
+; CHECK-NEXT:    add x8, sp, #752
+; CHECK-NEXT:    ld1 { v6.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v16.b }[1], [x8]
+; CHECK-NEXT:    add x10, sp, #24
+; CHECK-NEXT:    smull v22.8h, v22.8b, v21.8b
+; CHECK-NEXT:    ld1 { v7.b }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #496
+; CHECK-NEXT:    mov v3.b[1], w1
+; CHECK-NEXT:    add x9, sp, #192
+; CHECK-NEXT:    ldr b20, [sp, #472]
+; CHECK-NEXT:    ldr b23, [sp, #208]
+; CHECK-NEXT:    ld1 { v16.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #488
+; CHECK-NEXT:    ld1 { v5.b }[6], [x9]
+; CHECK-NEXT:    ld1 { v17.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #944
+; CHECK-NEXT:    add x9, sp, #328
+; CHECK-NEXT:    ld1 { v18.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #688
+; CHECK-NEXT:    ld1 { v6.b }[6], [x9]
+; CHECK-NEXT:    ld1 { v16.b }[3], [x13]
+; CHECK-NEXT:    ld1 { v19.b }[2], [x11]
+; CHECK-NEXT:    add x11, sp, #504
 ; CHECK-NEXT:    ld1 { v17.b }[2], [x10]
-; CHECK-NEXT:    add x10, sp, #680
-; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    ld1 { v6.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #488
-; CHECK-NEXT:    ld1 { v22.b }[1], [x10]
-; CHECK-NEXT:    ld1 { v19.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #944
-; CHECK-NEXT:    add x10, sp, #688
-; CHECK-NEXT:    ld1 { v21.b }[1], [x9]
-; CHECK-NEXT:    add x9, sp, #496
+; CHECK-NEXT:    add x10, sp, #952
+; CHECK-NEXT:    add x13, sp, #784
+; CHECK-NEXT:    ld1 { v18.b }[2], [x10]
+; CHECK-NEXT:    add x10, sp, #32
+; CHECK-NEXT:    add x9, sp, #40
+; CHECK-NEXT:    ld1 { v16.b }[4], [x12]
+; CHECK-NEXT:    add x12, sp, #696
+; CHECK-NEXT:    ld1 { v7.b }[2], [x10]
 ; CHECK-NEXT:    ld1 { v17.b }[3], [x11]
-; CHECK-NEXT:    ld1 { v22.b }[2], [x10]
-; CHECK-NEXT:    add x11, sp, #776
-; CHECK-NEXT:    add x10, sp, #504
-; CHECK-NEXT:    ld1 { v19.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #952
-; CHECK-NEXT:    smull v20.8h, v20.8b, v18.8b
-; CHECK-NEXT:    ld1 { v21.b }[2], [x9]
-; CHECK-NEXT:    ld1 { v17.b }[4], [x11]
-; CHECK-NEXT:    add x11, sp, #696
-; CHECK-NEXT:    add x9, sp, #24
-; CHECK-NEXT:    ld1 { v22.b }[3], [x11]
-; CHECK-NEXT:    add x11, sp, #792
-; CHECK-NEXT:    ld1 { v19.b }[3], [x10]
-; CHECK-NEXT:    add x10, sp, #960
-; CHECK-NEXT:    ld1 { v16.b }[1], [x9]
-; CHECK-NEXT:    ld1 { v21.b }[3], [x10]
-; CHECK-NEXT:    add x9, sp, #512
-; CHECK-NEXT:    ld1 { v17.b }[5], [x12]
-; CHECK-NEXT:    add x10, sp, #704
-; CHECK-NEXT:    add x12, sp, #800
-; CHECK-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v19.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #968
-; CHECK-NEXT:    ld1 { v22.b }[4], [x10]
-; CHECK-NEXT:    ld1 { v21.b }[4], [x9]
+; CHECK-NEXT:    add x11, sp, #960
+; CHECK-NEXT:    ld1 { v19.b }[3], [x12]
+; CHECK-NEXT:    ld1 { v18.b }[3], [x11]
+; CHECK-NEXT:    add x10, sp, #512
+; CHECK-NEXT:    add x11, sp, #704
+; CHECK-NEXT:    ld1 { v16.b }[5], [x13]
+; CHECK-NEXT:    add x12, sp, #792
+; CHECK-NEXT:    sshll v24.4s, v22.4h, #0
+; CHECK-NEXT:    ld1 { v17.b }[4], [x10]
+; CHECK-NEXT:    add x10, sp, #968
+; CHECK-NEXT:    ld1 { v19.b }[4], [x11]
+; CHECK-NEXT:    ld1 { v18.b }[4], [x10]
 ; CHECK-NEXT:    add x10, sp, #520
+; CHECK-NEXT:    add x11, sp, #976
+; CHECK-NEXT:    ld1 { v16.b }[6], [x12]
+; CHECK-NEXT:    add x12, sp, #712
+; CHECK-NEXT:    smull v20.8h, v23.8b, v20.8b
+; CHECK-NEXT:    ld1 { v17.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v19.b }[5], [x12]
+; CHECK-NEXT:    add x12, sp, #720
+; CHECK-NEXT:    ld1 { v18.b }[5], [x11]
+; CHECK-NEXT:    add x11, sp, #528
+; CHECK-NEXT:    add x10, sp, #800
+; CHECK-NEXT:    ld1 { v16.b }[7], [x10]
+; CHECK-NEXT:    add x10, sp, #536
+; CHECK-NEXT:    ldr b22, [sp, #872]
 ; CHECK-NEXT:    ld1 { v17.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #712
-; CHECK-NEXT:    add x9, sp, #32
-; CHECK-NEXT:    sshll v23.4s, v20.4h, #0
-; CHECK-NEXT:    ld1 { v19.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #976
-; CHECK-NEXT:    ld1 { v22.b }[5], [x11]
-; CHECK-NEXT:    ld1 { v21.b }[5], [x10]
-; CHECK-NEXT:    add x10, sp, #528
-; CHECK-NEXT:    add x11, sp, #720
-; CHECK-NEXT:    ld1 { v16.b }[2], [x9]
-; CHECK-NEXT:    add x9, sp, #536
-; CHECK-NEXT:    ld1 { v17.b }[7], [x12]
-; CHECK-NEXT:    ld1 { v19.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #984
-; CHECK-NEXT:    ld1 { v22.b }[6], [x11]
-; CHECK-NEXT:    ld1 { v21.b }[6], [x10]
-; CHECK-NEXT:    add x10, sp, #992
-; CHECK-NEXT:    add x11, sp, #728
-; CHECK-NEXT:    mov v1.b[1], w1
-; CHECK-NEXT:    ldr b20, [sp, #872]
-; CHECK-NEXT:    mov v18.s[0], v23.s[0]
-; CHECK-NEXT:    ld1 { v19.b }[7], [x9]
-; CHECK-NEXT:    ld1 { v22.b }[7], [x11]
-; CHECK-NEXT:    add x9, sp, #328
-; CHECK-NEXT:    ld1 { v21.b }[7], [x10]
-; CHECK-NEXT:    add x10, sp, #40
+; CHECK-NEXT:    add x11, sp, #984
+; CHECK-NEXT:    ld1 { v19.b }[6], [x12]
+; CHECK-NEXT:    ld1 { v18.b }[6], [x11]
+; CHECK-NEXT:    add x11, sp, #992
+; CHECK-NEXT:    add x12, sp, #728
 ; CHECK-NEXT:    ldr b23, [sp, #608]
-; CHECK-NEXT:    ld1 { v16.b }[3], [x10]
+; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
+; CHECK-NEXT:    add x9, sp, #880
+; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v19.b }[7], [x12]
 ; CHECK-NEXT:    add x10, sp, #816
+; CHECK-NEXT:    ld1 { v18.b }[7], [x11]
 ; CHECK-NEXT:    add x11, sp, #552
-; CHECK-NEXT:    smull v17.8h, v19.8b, v17.8b
-; CHECK-NEXT:    ld1 { v6.b }[6], [x9]
-; CHECK-NEXT:    add x9, sp, #880
-; CHECK-NEXT:    smull v19.8h, v22.8b, v21.8b
-; CHECK-NEXT:    ldr b21, [sp, #808]
-; CHECK-NEXT:    ldr b22, [sp, #544]
 ; CHECK-NEXT:    add x12, sp, #616
-; CHECK-NEXT:    mov v1.b[2], w2
-; CHECK-NEXT:    ld1 { v20.b }[1], [x9]
-; CHECK-NEXT:    ld1 { v21.b }[1], [x10]
-; CHECK-NEXT:    ld1 { v22.b }[1], [x11]
+; CHECK-NEXT:    mov v3.b[2], w2
+; CHECK-NEXT:    ld1 { v22.b }[1], [x9]
 ; CHECK-NEXT:    ld1 { v23.b }[1], [x12]
-; CHECK-NEXT:    add x11, sp, #824
+; CHECK-NEXT:    smull v16.8h, v17.8b, v16.8b
 ; CHECK-NEXT:    add x12, sp, #560
 ; CHECK-NEXT:    add x9, sp, #888
+; CHECK-NEXT:    smull v17.8h, v19.8b, v18.8b
+; CHECK-NEXT:    ldr b18, [sp, #808]
+; CHECK-NEXT:    ldr b19, [sp, #544]
 ; CHECK-NEXT:    add x13, sp, #624
+; CHECK-NEXT:    ld1 { v22.b }[2], [x9]
+; CHECK-NEXT:    add x9, sp, #896
+; CHECK-NEXT:    ld1 { v18.b }[1], [x10]
+; CHECK-NEXT:    ld1 { v19.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #824
 ; CHECK-NEXT:    add x10, sp, #48
-; CHECK-NEXT:    ld1 { v20.b }[2], [x9]
-; CHECK-NEXT:    ld1 { v21.b }[2], [x11]
-; CHECK-NEXT:    ld1 { v22.b }[2], [x12]
 ; CHECK-NEXT:    ld1 { v23.b }[2], [x13]
-; CHECK-NEXT:    mov v1.b[3], w3
-; CHECK-NEXT:    ld1 { v16.b }[4], [x10]
+; CHECK-NEXT:    mov v3.b[3], w3
+; CHECK-NEXT:    ld1 { v7.b }[4], [x10]
 ; CHECK-NEXT:    add x10, sp, #832
+; CHECK-NEXT:    ld1 { v22.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v18.b }[2], [x11]
+; CHECK-NEXT:    ld1 { v19.b }[2], [x12]
 ; CHECK-NEXT:    add x11, sp, #568
-; CHECK-NEXT:    add x9, sp, #896
 ; CHECK-NEXT:    add x12, sp, #632
-; CHECK-NEXT:    ld1 { v21.b }[3], [x10]
-; CHECK-NEXT:    ld1 { v22.b }[3], [x11]
-; CHECK-NEXT:    ld1 { v20.b }[3], [x9]
-; CHECK-NEXT:    ld1 { v23.b }[3], [x12]
-; CHECK-NEXT:    add x11, sp, #840
-; CHECK-NEXT:    add x12, sp, #576
-; CHECK-NEXT:    mov v1.b[4], w4
 ; CHECK-NEXT:    add x9, sp, #904
 ; CHECK-NEXT:    add x13, sp, #640
-; CHECK-NEXT:    ld1 { v21.b }[4], [x11]
-; CHECK-NEXT:    ld1 { v22.b }[4], [x12]
+; CHECK-NEXT:    ld1 { v23.b }[3], [x12]
+; CHECK-NEXT:    add x12, sp, #576
+; CHECK-NEXT:    mov v3.b[4], w4
+; CHECK-NEXT:    ld1 { v18.b }[3], [x10]
+; CHECK-NEXT:    ld1 { v19.b }[3], [x11]
+; CHECK-NEXT:    add x11, sp, #840
 ; CHECK-NEXT:    add x10, sp, #56
-; CHECK-NEXT:    ld1 { v20.b }[4], [x9]
+; CHECK-NEXT:    ld1 { v22.b }[4], [x9]
+; CHECK-NEXT:    add x9, sp, #912
 ; CHECK-NEXT:    ld1 { v23.b }[4], [x13]
-; CHECK-NEXT:    ld1 { v16.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v7.b }[5], [x10]
 ; CHECK-NEXT:    add x10, sp, #848
+; CHECK-NEXT:    ld1 { v18.b }[4], [x11]
+; CHECK-NEXT:    ld1 { v19.b }[4], [x12]
 ; CHECK-NEXT:    add x11, sp, #584
-; CHECK-NEXT:    add x9, sp, #912
 ; CHECK-NEXT:    add x12, sp, #648
-; CHECK-NEXT:    ld1 { v21.b }[5], [x10]
-; CHECK-NEXT:    ld1 { v22.b }[5], [x11]
-; CHECK-NEXT:    mov v1.b[5], w5
-; CHECK-NEXT:    ld1 { v20.b }[5], [x9]
+; CHECK-NEXT:    mov v3.b[5], w5
+; CHECK-NEXT:    ld1 { v22.b }[5], [x9]
 ; CHECK-NEXT:    ld1 { v23.b }[5], [x12]
-; CHECK-NEXT:    add x11, sp, #856
 ; CHECK-NEXT:    add x12, sp, #592
+; CHECK-NEXT:    movi v21.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v18.b }[5], [x10]
+; CHECK-NEXT:    ld1 { v19.b }[5], [x11]
+; CHECK-NEXT:    add x11, sp, #856
 ; CHECK-NEXT:    add x9, sp, #920
 ; CHECK-NEXT:    add x13, sp, #656
-; CHECK-NEXT:    ld1 { v21.b }[6], [x11]
-; CHECK-NEXT:    ld1 { v22.b }[6], [x12]
 ; CHECK-NEXT:    add x10, sp, #64
-; CHECK-NEXT:    ld1 { v20.b }[6], [x9]
+; CHECK-NEXT:    ld1 { v22.b }[6], [x9]
 ; CHECK-NEXT:    ld1 { v23.b }[6], [x13]
-; CHECK-NEXT:    mov v1.b[6], w6
-; CHECK-NEXT:    ld1 { v16.b }[6], [x10]
+; CHECK-NEXT:    mov v3.b[6], w6
+; CHECK-NEXT:    ld1 { v18.b }[6], [x11]
+; CHECK-NEXT:    ld1 { v19.b }[6], [x12]
+; CHECK-NEXT:    ld1 { v7.b }[6], [x10]
 ; CHECK-NEXT:    add x10, sp, #864
 ; CHECK-NEXT:    add x11, sp, #600
 ; CHECK-NEXT:    add x9, sp, #928
 ; CHECK-NEXT:    add x12, sp, #664
-; CHECK-NEXT:    ld1 { v21.b }[7], [x10]
-; CHECK-NEXT:    ld1 { v22.b }[7], [x11]
-; CHECK-NEXT:    add x8, sp, #464
-; CHECK-NEXT:    ld1 { v20.b }[7], [x9]
+; CHECK-NEXT:    mov v21.s[0], v24.s[0]
+; CHECK-NEXT:    ld1 { v22.b }[7], [x9]
+; CHECK-NEXT:    ld1 { v18.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v19.b }[7], [x11]
 ; CHECK-NEXT:    ld1 { v23.b }[7], [x12]
-; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
 ; CHECK-NEXT:    add x8, sp, #200
-; CHECK-NEXT:    mov v1.b[7], w7
+; CHECK-NEXT:    mov v3.b[7], w7
 ; CHECK-NEXT:    add x10, sp, #336
 ; CHECK-NEXT:    ld1 { v5.b }[7], [x8]
 ; CHECK-NEXT:    add x8, sp, #72
-; CHECK-NEXT:    smull v21.8h, v22.8b, v21.8b
-; CHECK-NEXT:    movi v22.2d, #0000000000000000
 ; CHECK-NEXT:    ld1 { v6.b }[7], [x10]
-; CHECK-NEXT:    ld1 { v16.b }[7], [x8]
-; CHECK-NEXT:    smull v20.8h, v23.8b, v20.8b
-; CHECK-NEXT:    sshll v7.4s, v7.4h, #0
-; CHECK-NEXT:    smull v0.8h, v2.8b, v0.8b
-; CHECK-NEXT:    saddw v2.4s, v18.4s, v17.4h
-; CHECK-NEXT:    smull v1.8h, v1.8b, v3.8b
+; CHECK-NEXT:    smull v18.8h, v19.8b, v18.8b
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    ld1 { v7.b }[7], [x8]
+; CHECK-NEXT:    smull v22.8h, v23.8b, v22.8b
+; CHECK-NEXT:    sshll v20.4s, v20.4h, #0
+; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT:    saddw v1.4s, v21.4s, v16.4h
+; CHECK-NEXT:    smull v2.8h, v3.8b, v2.8b
 ; CHECK-NEXT:    smull v3.8h, v5.8b, v4.8b
-; CHECK-NEXT:    smull v4.8h, v16.8b, v6.8b
-; CHECK-NEXT:    saddl2 v5.4s, v21.8h, v19.8h
-; CHECK-NEXT:    mov v22.s[0], v7.s[0]
-; CHECK-NEXT:    saddl v7.4s, v21.4h, v19.4h
-; CHECK-NEXT:    saddl2 v6.4s, v17.8h, v20.8h
-; CHECK-NEXT:    saddw v2.4s, v2.4s, v20.4h
-; CHECK-NEXT:    saddl2 v17.4s, v1.8h, v0.8h
+; CHECK-NEXT:    smull v4.8h, v7.8b, v6.8b
+; CHECK-NEXT:    mov v19.s[0], v20.s[0]
+; CHECK-NEXT:    saddl2 v5.4s, v18.8h, v17.8h
+; CHECK-NEXT:    saddl v7.4s, v18.4h, v17.4h
+; CHECK-NEXT:    saddl2 v6.4s, v16.8h, v22.8h
+; CHECK-NEXT:    saddw v1.4s, v1.4s, v22.4h
+; CHECK-NEXT:    saddl2 v17.4s, v2.8h, v0.8h
 ; CHECK-NEXT:    saddl2 v16.4s, v4.8h, v3.8h
 ; CHECK-NEXT:    saddl v3.4s, v4.4h, v3.4h
-; CHECK-NEXT:    saddw v1.4s, v22.4s, v1.4h
+; CHECK-NEXT:    saddw v2.4s, v19.4s, v2.4h
 ; CHECK-NEXT:    add v5.4s, v6.4s, v5.4s
-; CHECK-NEXT:    add v2.4s, v2.4s, v7.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v7.4s
 ; CHECK-NEXT:    add v6.4s, v17.4s, v16.4s
-; CHECK-NEXT:    saddw v0.4s, v1.4s, v0.4h
-; CHECK-NEXT:    add v1.4s, v2.4s, v5.4s
+; CHECK-NEXT:    saddw v0.4s, v2.4s, v0.4h
+; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
@@ -1961,147 +1961,147 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
 ; CHECK-NEXT:    ldr b0, [sp, #80]
 ; CHECK-NEXT:    add x8, sp, #88
 ; CHECK-NEXT:    ldr b2, [sp, #144]
-; CHECK-NEXT:    add x9, sp, #96
-; CHECK-NEXT:    add x10, sp, #152
-; CHECK-NEXT:    add x11, sp, #160
-; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
-; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
-; CHECK-NEXT:    add x8, sp, #104
+; CHECK-NEXT:    add x9, sp, #152
 ; CHECK-NEXT:    ldr b3, [sp, #16]
+; CHECK-NEXT:    add x11, sp, #104
+; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
+; CHECK-NEXT:    ld1 { v2.b }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #24
+; CHECK-NEXT:    add x8, sp, #96
+; CHECK-NEXT:    ld1 { v3.b }[1], [x9]
 ; CHECK-NEXT:    ldr b5, [sp, #480]
 ; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    add x10, sp, #24
-; CHECK-NEXT:    add x13, sp, #488
+; CHECK-NEXT:    add x10, sp, #112
+; CHECK-NEXT:    add x12, sp, #168
+; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #160
 ; CHECK-NEXT:    ldr b4, [sp, #608]
-; CHECK-NEXT:    ld1 { v0.b }[2], [x9]
-; CHECK-NEXT:    ld1 { v2.b }[2], [x11]
-; CHECK-NEXT:    add x12, sp, #112
-; CHECK-NEXT:    ld1 { v3.b }[1], [x10]
-; CHECK-NEXT:    ld1 { v5.b }[1], [x13]
-; CHECK-NEXT:    add x10, sp, #616
+; CHECK-NEXT:    ld1 { v2.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #32
+; CHECK-NEXT:    add x13, sp, #496
+; CHECK-NEXT:    ld1 { v3.b }[2], [x8]
 ; CHECK-NEXT:    mov v1.b[1], w1
-; CHECK-NEXT:    ld1 { v4.b }[1], [x10]
-; CHECK-NEXT:    add x10, sp, #32
-; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #168
-; CHECK-NEXT:    add x9, sp, #120
-; CHECK-NEXT:    ld1 { v2.b }[3], [x8]
-; CHECK-NEXT:    add x8, sp, #496
-; CHECK-NEXT:    ld1 { v3.b }[2], [x10]
-; CHECK-NEXT:    ld1 { v5.b }[2], [x8]
-; CHECK-NEXT:    add x8, sp, #176
 ; CHECK-NEXT:    ldr b6, [sp, #672]
-; CHECK-NEXT:    ld1 { v0.b }[4], [x12]
-; CHECK-NEXT:    add x14, sp, #680
-; CHECK-NEXT:    ldr b7, [sp, #544]
-; CHECK-NEXT:    ld1 { v2.b }[4], [x8]
-; CHECK-NEXT:    add x13, sp, #40
-; CHECK-NEXT:    ld1 { v6.b }[1], [x14]
-; CHECK-NEXT:    mov v1.b[2], w2
-; CHECK-NEXT:    add x11, sp, #128
-; CHECK-NEXT:    ld1 { v3.b }[3], [x13]
+; CHECK-NEXT:    ld1 { v0.b }[3], [x11]
+; CHECK-NEXT:    add x11, sp, #488
+; CHECK-NEXT:    add x9, sp, #120
+; CHECK-NEXT:    ld1 { v5.b }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #40
+; CHECK-NEXT:    ld1 { v2.b }[3], [x12]
+; CHECK-NEXT:    ld1 { v3.b }[3], [x11]
+; CHECK-NEXT:    add x12, sp, #616
+; CHECK-NEXT:    ldr b16, [sp, #544]
+; CHECK-NEXT:    ld1 { v0.b }[4], [x10]
+; CHECK-NEXT:    add x10, sp, #48
+; CHECK-NEXT:    ld1 { v4.b }[1], [x12]
+; CHECK-NEXT:    add x12, sp, #176
+; CHECK-NEXT:    ld1 { v5.b }[2], [x13]
+; CHECK-NEXT:    add x13, sp, #680
+; CHECK-NEXT:    ld1 { v3.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
+; CHECK-NEXT:    ld1 { v6.b }[1], [x13]
+; CHECK-NEXT:    add x13, sp, #56
 ; CHECK-NEXT:    ld1 { v0.b }[5], [x9]
-; CHECK-NEXT:    add x9, sp, #552
-; CHECK-NEXT:    add x13, sp, #184
-; CHECK-NEXT:    ld1 { v7.b }[1], [x9]
-; CHECK-NEXT:    ld1 { v2.b }[5], [x13]
-; CHECK-NEXT:    add x13, sp, #624
-; CHECK-NEXT:    add x15, sp, #504
-; CHECK-NEXT:    ld1 { v4.b }[2], [x13]
-; CHECK-NEXT:    add x10, sp, #136
-; CHECK-NEXT:    ld1 { v0.b }[6], [x11]
-; CHECK-NEXT:    add x11, sp, #688
-; CHECK-NEXT:    ld1 { v5.b }[3], [x15]
-; CHECK-NEXT:    ld1 { v6.b }[2], [x11]
-; CHECK-NEXT:    add x11, sp, #560
+; CHECK-NEXT:    mov v1.b[2], w2
+; CHECK-NEXT:    add x8, sp, #128
+; CHECK-NEXT:    add x14, sp, #184
+; CHECK-NEXT:    add x11, sp, #136
+; CHECK-NEXT:    ld1 { v3.b }[5], [x13]
+; CHECK-NEXT:    add x13, sp, #552
+; CHECK-NEXT:    ld1 { v2.b }[5], [x14]
+; CHECK-NEXT:    ld1 { v16.b }[1], [x13]
+; CHECK-NEXT:    add x14, sp, #624
+; CHECK-NEXT:    ld1 { v0.b }[6], [x8]
+; CHECK-NEXT:    add x8, sp, #688
+; CHECK-NEXT:    add x13, sp, #504
+; CHECK-NEXT:    ld1 { v4.b }[2], [x14]
+; CHECK-NEXT:    ld1 { v6.b }[2], [x8]
+; CHECK-NEXT:    add x8, sp, #560
+; CHECK-NEXT:    ld1 { v5.b }[3], [x13]
+; CHECK-NEXT:    ld1 { v16.b }[2], [x8]
 ; CHECK-NEXT:    mov v1.b[3], w3
-; CHECK-NEXT:    ld1 { v7.b }[2], [x11]
-; CHECK-NEXT:    add x9, sp, #632
+; CHECK-NEXT:    add x9, sp, #64
+; CHECK-NEXT:    add x15, sp, #632
+; CHECK-NEXT:    ld1 { v3.b }[6], [x9]
+; CHECK-NEXT:    ld1 { v0.b }[7], [x11]
+; CHECK-NEXT:    ld1 { v4.b }[3], [x15]
+; CHECK-NEXT:    add x8, sp, #696
+; CHECK-NEXT:    add x9, sp, #568
 ; CHECK-NEXT:    add x11, sp, #512
-; CHECK-NEXT:    ld1 { v0.b }[7], [x10]
-; CHECK-NEXT:    ld1 { v4.b }[3], [x9]
-; CHECK-NEXT:    add x9, sp, #696
-; CHECK-NEXT:    add x10, sp, #568
-; CHECK-NEXT:    ld1 { v6.b }[3], [x9]
+; CHECK-NEXT:    ld1 { v6.b }[3], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[3], [x9]
 ; CHECK-NEXT:    ld1 { v5.b }[4], [x11]
-; CHECK-NEXT:    ld1 { v7.b }[3], [x10]
-; CHECK-NEXT:    add x9, sp, #640
+; CHECK-NEXT:    add x8, sp, #640
 ; CHECK-NEXT:    mov v1.b[4], w4
-; CHECK-NEXT:    ld1 { v4.b }[4], [x9]
-; CHECK-NEXT:    add x9, sp, #704
-; CHECK-NEXT:    add x10, sp, #576
+; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
+; CHECK-NEXT:    add x8, sp, #704
+; CHECK-NEXT:    add x9, sp, #576
 ; CHECK-NEXT:    add x11, sp, #520
-; CHECK-NEXT:    ld1 { v6.b }[4], [x9]
-; CHECK-NEXT:    ldr b18, [sp, #736]
-; CHECK-NEXT:    ld1 { v7.b }[4], [x10]
+; CHECK-NEXT:    ld1 { v6.b }[4], [x8]
+; CHECK-NEXT:    ld1 { v16.b }[4], [x9]
 ; CHECK-NEXT:    ld1 { v5.b }[5], [x11]
+; CHECK-NEXT:    ldr b18, [sp, #736]
 ; CHECK-NEXT:    add x12, sp, #192
-; CHECK-NEXT:    add x8, sp, #48
 ; CHECK-NEXT:    ld1 { v2.b }[6], [x12]
-; CHECK-NEXT:    add x9, sp, #648
-; CHECK-NEXT:    ld1 { v3.b }[4], [x8]
-; CHECK-NEXT:    add x10, sp, #528
+; CHECK-NEXT:    add x8, sp, #648
+; CHECK-NEXT:    add x9, sp, #528
 ; CHECK-NEXT:    add x11, sp, #712
 ; CHECK-NEXT:    add x12, sp, #584
 ; CHECK-NEXT:    sshll v18.8h, v18.8b, #0
 ; CHECK-NEXT:    mov v1.b[5], w5
 ; CHECK-NEXT:    ld1 { v6.b }[5], [x11]
-; CHECK-NEXT:    ld1 { v7.b }[5], [x12]
-; CHECK-NEXT:    ld1 { v4.b }[5], [x9]
-; CHECK-NEXT:    ld1 { v5.b }[6], [x10]
-; CHECK-NEXT:    add x14, sp, #56
+; CHECK-NEXT:    ld1 { v16.b }[5], [x12]
+; CHECK-NEXT:    ld1 { v4.b }[5], [x8]
+; CHECK-NEXT:    ld1 { v5.b }[6], [x9]
 ; CHECK-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-NEXT:    ld1 { v3.b }[5], [x14]
-; CHECK-NEXT:    add x9, sp, #656
-; CHECK-NEXT:    add x10, sp, #536
+; CHECK-NEXT:    add x8, sp, #656
+; CHECK-NEXT:    add x9, sp, #536
 ; CHECK-NEXT:    add x11, sp, #720
 ; CHECK-NEXT:    add x12, sp, #592
 ; CHECK-NEXT:    sshll v18.4s, v18.4h, #0
-; CHECK-NEXT:    ldr b16, [sp, #208]
+; CHECK-NEXT:    ldr b7, [sp, #208]
 ; CHECK-NEXT:    ld1 { v6.b }[6], [x11]
-; CHECK-NEXT:    ld1 { v7.b }[6], [x12]
-; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
-; CHECK-NEXT:    ld1 { v5.b }[7], [x10]
-; CHECK-NEXT:    add x8, sp, #64
+; CHECK-NEXT:    ld1 { v16.b }[6], [x12]
+; CHECK-NEXT:    ld1 { v4.b }[6], [x8]
+; CHECK-NEXT:    ld1 { v5.b }[7], [x9]
 ; CHECK-NEXT:    mov v1.b[6], w6
-; CHECK-NEXT:    sshll v16.8h, v16.8b, #0
-; CHECK-NEXT:    ld1 { v3.b }[6], [x8]
+; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
 ; CHECK-NEXT:    add x8, sp, #664
 ; CHECK-NEXT:    add x9, sp, #728
-; CHECK-NEXT:    add x10, sp, #600
+; CHECK-NEXT:    add x11, sp, #600
 ; CHECK-NEXT:    mov v17.s[0], v18.s[0]
 ; CHECK-NEXT:    ld1 { v6.b }[7], [x9]
-; CHECK-NEXT:    ld1 { v7.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v16.b }[7], [x11]
 ; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
 ; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
 ; CHECK-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-NEXT:    sshll v16.4s, v16.4h, #0
+; CHECK-NEXT:    add x10, sp, #200
 ; CHECK-NEXT:    mov v1.b[7], w7
-; CHECK-NEXT:    add x9, sp, #200
-; CHECK-NEXT:    add x8, sp, #72
-; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
+; CHECK-NEXT:    add x9, sp, #72
+; CHECK-NEXT:    sshll v7.4s, v7.4h, #0
+; CHECK-NEXT:    ld1 { v2.b }[7], [x10]
+; CHECK-NEXT:    ld1 { v3.b }[7], [x9]
 ; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
-; CHECK-NEXT:    ld1 { v3.b }[7], [x8]
-; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
+; CHECK-NEXT:    sshll v16.8h, v16.8b, #0
 ; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
 ; CHECK-NEXT:    saddw v17.4s, v17.4s, v5.4h
-; CHECK-NEXT:    mov v18.s[0], v16.s[0]
 ; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:    mov v18.s[0], v7.s[0]
 ; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
 ; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-NEXT:    saddl2 v16.4s, v7.8h, v6.8h
+; CHECK-NEXT:    saddl2 v7.4s, v16.8h, v6.8h
 ; CHECK-NEXT:    saddl2 v5.4s, v5.8h, v4.8h
-; CHECK-NEXT:    saddl v6.4s, v7.4h, v6.4h
+; CHECK-NEXT:    saddl v6.4s, v16.4h, v6.4h
 ; CHECK-NEXT:    saddw v4.4s, v17.4s, v4.4h
 ; CHECK-NEXT:    saddl2 v17.4s, v1.8h, v0.8h
+; CHECK-NEXT:    saddl2 v16.4s, v3.8h, v2.8h
 ; CHECK-NEXT:    saddw v1.4s, v18.4s, v1.4h
-; CHECK-NEXT:    saddl2 v7.4s, v3.8h, v2.8h
-; CHECK-NEXT:    add v5.4s, v5.4s, v16.4s
-; CHECK-NEXT:    saddl v2.4s, v3.4h, v2.4h
+; CHECK-NEXT:    add v5.4s, v5.4s, v7.4s
 ; CHECK-NEXT:    add v4.4s, v4.4s, v6.4s
+; CHECK-NEXT:    saddl v2.4s, v3.4h, v2.4h
+; CHECK-NEXT:    add v6.4s, v17.4s, v16.4s
 ; CHECK-NEXT:    saddw v0.4s, v1.4s, v0.4h
-; CHECK-NEXT:    add v6.4s, v17.4s, v7.4s
 ; CHECK-NEXT:    add v1.4s, v4.4s, v5.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s

diff  --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
index 619134dc4a696b..8b631199b0594b 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
@@ -8,8 +8,7 @@
 define <vscale x 4 x i32> @test_post_ld1_insert(ptr %a, ptr %ptr, i64 %inc) {
 ; CHECK-LABEL: test_post_ld1_insert:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldr s0, [x0]
 ; CHECK-NEXT:    add x8, x0, x2, lsl #2
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index c838ffb0a6576e..184e8fff154b95 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -705,14 +705,14 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:    mov w8, #1000 ; =0x3e8
 ; CHECK-NEXT:  LBB6_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldp q4, q1, [x0, #48]
+; CHECK-NEXT:    ldp q4, q0, [x0, #48]
 ; CHECK-NEXT:    add x9, x1, #10
-; CHECK-NEXT:    ldr d0, [x0, #80]
+; CHECK-NEXT:    ldr d1, [x0, #80]
 ; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    ldr q5, [x0, #32]
 ; CHECK-NEXT:    subs x8, x8, #1
 ; CHECK-NEXT:    add x0, x0, #128
-; CHECK-NEXT:    uzp1.4s v0, v1, v0
+; CHECK-NEXT:    uzp1.4s v0, v0, v1
 ; CHECK-NEXT:    uzp1.4s v1, v5, v4
 ; CHECK-NEXT:    uzp1.4s v2, v3, v2
 ; CHECK-NEXT:    xtn.4h v0, v0


        


More information about the llvm-commits mailing list