[llvm] 9b5971a - [AArch64][GlobalISel] Lower G_BUILD_VECTOR to G_INSERT_VECTOR_ELT (#105686)

via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 3 18:55:29 PDT 2024


Author: chuongg3
Date: 2024-09-03T18:55:23-07:00
New Revision: 9b5971ad0355d43a9bd37b1067d93ff8b08eba81

URL: https://github.com/llvm/llvm-project/commit/9b5971ad0355d43a9bd37b1067d93ff8b08eba81
DIFF: https://github.com/llvm/llvm-project/commit/9b5971ad0355d43a9bd37b1067d93ff8b08eba81.diff

LOG: [AArch64][GlobalISel] Lower G_BUILD_VECTOR to G_INSERT_VECTOR_ELT (#105686)

The lowering happens in post-legalizer lowering if any source registers
from G_BUILD_VECTOR are not constants.

Add a pattern fragment setting `scalar_to_vector ($src)` as equivalent to
`vector_insert (undef), ($src), (i64 0)`

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64Combine.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td
    llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
    llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
    llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
    llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
    llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
    llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
    llvm/test/CodeGen/AArch64/aarch64-smull.ll
    llvm/test/CodeGen/AArch64/abs.ll
    llvm/test/CodeGen/AArch64/add.ll
    llvm/test/CodeGen/AArch64/andorxor.ll
    llvm/test/CodeGen/AArch64/arm64-dup.ll
    llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
    llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
    llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
    llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
    llvm/test/CodeGen/AArch64/arm64-tbl.ll
    llvm/test/CodeGen/AArch64/bitcast.ll
    llvm/test/CodeGen/AArch64/bswap.ll
    llvm/test/CodeGen/AArch64/concat-vector.ll
    llvm/test/CodeGen/AArch64/fabs.ll
    llvm/test/CodeGen/AArch64/faddsub.ll
    llvm/test/CodeGen/AArch64/fcmp.ll
    llvm/test/CodeGen/AArch64/fcopysign.ll
    llvm/test/CodeGen/AArch64/fcvt.ll
    llvm/test/CodeGen/AArch64/fdiv.ll
    llvm/test/CodeGen/AArch64/fexplog.ll
    llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
    llvm/test/CodeGen/AArch64/fminimummaximum.ll
    llvm/test/CodeGen/AArch64/fminmax.ll
    llvm/test/CodeGen/AArch64/fmla.ll
    llvm/test/CodeGen/AArch64/fmul.ll
    llvm/test/CodeGen/AArch64/fneg.ll
    llvm/test/CodeGen/AArch64/fpow.ll
    llvm/test/CodeGen/AArch64/fpowi.ll
    llvm/test/CodeGen/AArch64/fptoi.ll
    llvm/test/CodeGen/AArch64/fptrunc.ll
    llvm/test/CodeGen/AArch64/frem.ll
    llvm/test/CodeGen/AArch64/fsincos.ll
    llvm/test/CodeGen/AArch64/fsqrt.ll
    llvm/test/CodeGen/AArch64/icmp.ll
    llvm/test/CodeGen/AArch64/insertextract.ll
    llvm/test/CodeGen/AArch64/itofp.ll
    llvm/test/CodeGen/AArch64/llvm.exp10.ll
    llvm/test/CodeGen/AArch64/load.ll
    llvm/test/CodeGen/AArch64/mul.ll
    llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
    llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
    llvm/test/CodeGen/AArch64/neon-extadd.ll
    llvm/test/CodeGen/AArch64/neon-extmul.ll
    llvm/test/CodeGen/AArch64/neon-perm.ll
    llvm/test/CodeGen/AArch64/ptradd.ll
    llvm/test/CodeGen/AArch64/rem.ll
    llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
    llvm/test/CodeGen/AArch64/sext.ll
    llvm/test/CodeGen/AArch64/shift.ll
    llvm/test/CodeGen/AArch64/shufflevector.ll
    llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
    llvm/test/CodeGen/AArch64/sub.ll
    llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
    llvm/test/CodeGen/AArch64/usub_sat_vec.ll
    llvm/test/CodeGen/AArch64/vecreduce-add.ll
    llvm/test/CodeGen/AArch64/xtn.ll
    llvm/test/CodeGen/AArch64/zext.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index d12f834da5a159..f99d1e276c60f9 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -222,7 +222,15 @@ def build_vector_to_dup : GICombineRule<
   (apply [{ applyBuildVectorToDup(*${root}, MRI, B); }])
 >;
 
-def build_vector_lowering : GICombineGroup<[build_vector_to_dup]>;
+def build_vector_to_vector_insert : GICombineRule<
+  (defs root:$root, register_matchinfo:$matchinfo),
+  (match (G_BUILD_VECTOR $dst, GIVariadic<>:$unused):$root,
+          [{ return matchLowerBuildToInsertVecElt(*${root}, MRI); }]),
+  (apply [{ applyLowerBuildToInsertVecElt(*${root}, MRI, B); }])
+>;
+
+def build_vector_lowering : GICombineGroup<[build_vector_to_dup,
+                                            build_vector_to_vector_insert]>;
 
 def lower_vector_fcmp : GICombineRule<
   (defs root:$root),

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 2fff6fffcd7c6d..c659697c3a1be3 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3315,6 +3315,10 @@ defm LDRSW  : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
 // Pre-fetch.
 defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
 
+def vec_ins_or_scal_vec : PatFrags<(ops node:$src),
+                          [(vector_insert undef, node:$src, (i64 0)),
+                           (scalar_to_vector node:$src)]>;
+
 // For regular load, we do not have any alignment requirement.
 // Thus, it is safe to directly map the vector loads with interesting
 // addressing modes.
@@ -3323,13 +3327,13 @@ multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
                               ValueType ScalTy, ValueType VecTy,
                               Instruction LOADW, Instruction LOADX,
                               SubRegIndex sub> {
-  def : Pat<(VecTy (scalar_to_vector (ScalTy
+  def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
               (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
             (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
                            (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
                            sub)>;
 
-  def : Pat<(VecTy (scalar_to_vector (ScalTy
+  def : Pat<(VecTy (vec_ins_or_scal_vec (ScalTy
               (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
             (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
                            (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
@@ -3357,12 +3361,12 @@ defm : ScalToVecROLoadPat<ro64, load,       i64, v2i64, LDRDroW, LDRDroX, dsub>;
 defm : ScalToVecROLoadPat<ro64, load,       f64, v2f64, LDRDroW, LDRDroX, dsub>;
 
 
-def : Pat <(v1i64 (scalar_to_vector (i64
+def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
                       (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
                                            ro_Wextend64:$extend))))),
            (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
 
-def : Pat <(v1i64 (scalar_to_vector (i64
+def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
                       (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
                                            ro_Xextend64:$extend))))),
            (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
@@ -3495,34 +3499,34 @@ def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
 // Thus, it is safe to directly map the vector loads with interesting
 // addressing modes.
 // FIXME: We could do the same for bitconvert to floating point vectors.
-def : Pat <(v8i8 (scalar_to_vector (i32
+def : Pat <(v8i8 (vec_ins_or_scal_vec (i32
                (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
            (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
                           (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v16i8 (scalar_to_vector (i32
+def : Pat <(v16i8 (vec_ins_or_scal_vec (i32
                (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
                           (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v4i16 (scalar_to_vector (i32
+def : Pat <(v4i16 (vec_ins_or_scal_vec (i32
                (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
            (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v8i16 (scalar_to_vector (i32
+def : Pat <(v8i16 (vec_ins_or_scal_vec (i32
                (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v2i32 (scalar_to_vector (i32
+def : Pat <(v2i32 (vec_ins_or_scal_vec (i32
                (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
            (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
                           (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v4i32 (scalar_to_vector (i32
+def : Pat <(v4i32 (vec_ins_or_scal_vec (i32
                (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
                           (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v1i64 (scalar_to_vector (i64
+def : Pat <(v1i64 (vec_ins_or_scal_vec (i64
                (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat <(v2i64 (scalar_to_vector (i64
+def : Pat <(v2i64 (vec_ins_or_scal_vec (i64
                (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
            (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
                           (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
@@ -6848,10 +6852,10 @@ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
 
 defm INS : SIMDIns;
 
-def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v16i8 (vec_ins_or_scal_vec GPR32:$Rn)),
           (SUBREG_TO_REG (i32 0),
                          (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v8i8 (vec_ins_or_scal_vec GPR32:$Rn)),
           (SUBREG_TO_REG (i32 0),
                          (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 
@@ -6859,50 +6863,49 @@ def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
 def : Pat<(v8i8 (bitconvert (i64 (zext GPR32:$Rn)))),
           (SUBREG_TO_REG (i32 0), (f32 (FMOVWSr GPR32:$Rn)), ssub)>;
 
-def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v8i16 (vec_ins_or_scal_vec GPR32:$Rn)),
           (SUBREG_TO_REG (i32 0),
                          (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v4i16 (vec_ins_or_scal_vec GPR32:$Rn)),
           (SUBREG_TO_REG (i32 0),
                          (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 
-def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
           (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
           (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
 
-def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
           (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
           (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
 
-def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+def : Pat<(v2i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))),
             (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
                                   (i32 FPR32:$Rn), ssub))>;
-def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+def : Pat<(v4i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))),
             (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
                                   (i32 FPR32:$Rn), ssub))>;
-
-def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+def : Pat<(v2i64 (vec_ins_or_scal_vec (i64 FPR64:$Rn))),
             (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
                                   (i64 FPR64:$Rn), dsub))>;
 
-def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
           (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
           (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
 
-def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
           (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
           (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
 
-def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+def : Pat<(v4f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))),
           (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
-def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+def : Pat<(v2f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))),
           (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
 
-def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
+def : Pat<(v2f64 (vec_ins_or_scal_vec (f64 FPR64:$Rn))),
           (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
 
 def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
@@ -8507,7 +8510,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
 let Predicates = [HasNEON] in {
   class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
                           SDPatternOperator ExtLoad, Instruction LD1>
-    : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
+    : Pat<(ResultTy (vec_ins_or_scal_vec (i32 (ExtLoad GPR64sp:$Rn)))),
             (ResultTy (EXTRACT_SUBREG
               (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>;
 
@@ -8940,11 +8943,11 @@ def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
           (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
-def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+def : Pat<(v1i64 (vec_ins_or_scal_vec GPR64:$Xn)),
           (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
-def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+def : Pat<(v1f64 (vec_ins_or_scal_vec GPR64:$Xn)),
           (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
+def : Pat<(v1f64 (vec_ins_or_scal_vec (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
 
 def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
           (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;

diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index e9e6b6cb68d0d1..18361cf3685642 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -2116,6 +2116,21 @@ bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
     I.getOperand(1).setReg(NewSrc.getReg(0));
     return true;
   }
+  case AArch64::G_INSERT_VECTOR_ELT: {
+    // Convert the type from p0 to s64 to help selection.
+    LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+    LLT SrcVecTy = MRI.getType(I.getOperand(1).getReg());
+    if (!SrcVecTy.isPointerVector())
+      return false;
+    auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(2).getReg());
+    MRI.setType(I.getOperand(1).getReg(),
+                DstTy.changeElementType(LLT::scalar(64)));
+    MRI.setType(I.getOperand(0).getReg(),
+                DstTy.changeElementType(LLT::scalar(64)));
+    MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
+    I.getOperand(2).setReg(NewSrc.getReg(0));
+    return true;
+  }
   case TargetOpcode::G_UITOFP:
   case TargetOpcode::G_SITOFP: {
     // If both source and destination regbanks are FPR, then convert the opcode

diff  --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 90ac4bdff4e0e4..b40fe55fdfaf67 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -1054,6 +1054,40 @@ void applyLowerVectorFCMP(MachineInstr &MI, MachineRegisterInfo &MRI,
   MI.eraseFromParent();
 }
 
+// Matches G_BUILD_VECTOR where at least one source operand is not a constant
+bool matchLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI) {
+  auto *GBuildVec = cast<GBuildVector>(&MI);
+
+  // Check if the values are all constants
+  for (unsigned I = 0; I < GBuildVec->getNumSources(); ++I) {
+    auto ConstVal =
+        getAnyConstantVRegValWithLookThrough(GBuildVec->getSourceReg(I), MRI);
+
+    if (!ConstVal.has_value())
+      return true;
+  }
+
+  return false;
+}
+
+void applyLowerBuildToInsertVecElt(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                   MachineIRBuilder &B) {
+  auto *GBuildVec = cast<GBuildVector>(&MI);
+  LLT DstTy = MRI.getType(GBuildVec->getReg(0));
+  Register DstReg = B.buildUndef(DstTy).getReg(0);
+
+  for (unsigned I = 0; I < GBuildVec->getNumSources(); ++I) {
+    Register SrcReg = GBuildVec->getSourceReg(I);
+    if (mi_match(SrcReg, MRI, m_GImplicitDef()))
+      continue;
+    auto IdxReg = B.buildConstant(LLT::scalar(64), I);
+    DstReg =
+        B.buildInsertVectorElement(DstTy, DstReg, SrcReg, IdxReg).getReg(0);
+  }
+  B.buildCopy(GBuildVec->getReg(0), DstReg);
+  GBuildVec->eraseFromParent();
+}
+
 bool matchFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI,
                          Register &SrcReg) {
   assert(MI.getOpcode() == TargetOpcode::G_STORE);

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
index f7efaeaa507053..87c1307ad29556 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
@@ -10,12 +10,14 @@ define i32 @bar() {
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
 ; CHECK-NEXT:    mov b1, v0[1]
-; CHECK-NEXT:    mov b2, v0[2]
-; CHECK-NEXT:    mov b3, v0[3]
-; CHECK-NEXT:    mov.h v0[1], v1[0]
-; CHECK-NEXT:    mov.h v2[1], v3[0]
+; CHECK-NEXT:    mov b2, v0[3]
+; CHECK-NEXT:    mov b3, v0[2]
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    fmov w9, s2
+; CHECK-NEXT:    mov.h v0[1], w8
+; CHECK-NEXT:    mov.h v3[1], w9
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ushll.4s v1, v2, #0
+; CHECK-NEXT:    ushll.4s v1, v3, #0
 ; CHECK-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-NEXT:    movi.4s v1, #1
 ; CHECK-NEXT:    and.16b v0, v0, v1

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
index 70867c2ea2842a..0115531dfb09ae 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
@@ -42,20 +42,30 @@ body:             |
     ; LOWER-NEXT: {{  $}}
     ; LOWER-NEXT: %r:_(s32) = COPY $w0
     ; LOWER-NEXT: %q:_(s32) = COPY $w1
-    ; LOWER-NEXT: %build_vector:_(<2 x s32>) = G_BUILD_VECTOR %r(s32), %q(s32)
+    ; LOWER-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; LOWER-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; LOWER-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %r(s32), [[C]](s64)
+    ; LOWER-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; LOWER-NEXT: [[IVEC1:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %q(s32), [[C1]](s64)
+    ; LOWER-NEXT: %build_vector:_(<2 x s32>) = COPY [[IVEC1]](<2 x s32>)
     ; LOWER-NEXT: $d0 = COPY %build_vector(<2 x s32>)
     ; LOWER-NEXT: RET_ReallyLR implicit $d0
     ;
     ; SELECT-LABEL: name: dont_combine_different_reg
     ; SELECT: liveins: $d0, $w0, $w1
     ; SELECT-NEXT: {{  $}}
-    ; SELECT-NEXT: %r:gpr32all = COPY $w0
+    ; SELECT-NEXT: %r:gpr32 = COPY $w0
     ; SELECT-NEXT: %q:gpr32 = COPY $w1
-    ; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], %r, %subreg.ssub
-    ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, %q
-    ; SELECT-NEXT: %build_vector:fpr64 = COPY [[INSvi32gpr]].dsub
-    ; SELECT-NEXT: $d0 = COPY %build_vector
+    ; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
+    ; SELECT-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[DEF]], %subreg.dsub
+    ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 0, %r
+    ; SELECT-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub
+    ; SELECT-NEXT: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; SELECT-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY]], %subreg.dsub
+    ; SELECT-NEXT: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG1]], 1, %q
+    ; SELECT-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr1]].dsub
+    ; SELECT-NEXT: $d0 = COPY [[COPY1]]
     ; SELECT-NEXT: RET_ReallyLR implicit $d0
     %r:_(s32) = COPY $w0
     %q:_(s32) = COPY $w1

diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
index 71094825e42f30..7c7689bcb80b5f 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
@@ -355,7 +355,21 @@ body:             |
     ; CHECK: liveins: $w0, $w1, $w2, $w3
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %lane:_(s32) = COPY $w0
-    ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_DUP %lane(s32)
+    ; CHECK-NEXT: %b:_(s32) = COPY $w1
+    ; CHECK-NEXT: %c:_(s32) = COPY $w2
+    ; CHECK-NEXT: %d:_(s32) = COPY $w3
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %lane(s32), [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %b(s32), [[C1]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], %c(s32), [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[IVEC3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC2]], %d(s32), [[C3]](s64)
+    ; CHECK-NEXT: %buildvec:_(<4 x s32>) = COPY [[IVEC3]](<4 x s32>)
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_DUPLANE32 %buildvec, [[C4]](s64)
     ; CHECK-NEXT: $q0 = COPY %shuf(<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %lane:_(s32) = COPY $w0
@@ -367,7 +381,7 @@ body:             |
     %shuf:_(<4 x s32>) = G_SHUFFLE_VECTOR %buildvec(<4 x s32>), %undef, shufflemask(0, 0, 0, 0)
     $q0 = COPY %shuf(<4 x s32>)
     RET_ReallyLR implicit $q0
- 
+
 ...
 ---
 name:            build_vector_rhs
@@ -382,10 +396,35 @@ body:             |
     ;
     ; CHECK-LABEL: name: build_vector
     ; CHECK: liveins: $w0, $w1, $w2, $w3, $w4
-    ; CHECK: %lane_1:_(s32) = COPY $w1
-    ; CHECK: %shuf:_(<4 x s32>) = G_DUP %lane_1(s32)
-    ; CHECK: $q0 = COPY %shuf(<4 x s32>)
-    ; CHECK: RET_ReallyLR implicit $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %lane_0:_(s32) = COPY $w0
+    ; CHECK-NEXT: %lane_1:_(s32) = COPY $w1
+    ; CHECK-NEXT: %b:_(s32) = COPY $w2
+    ; CHECK-NEXT: %c:_(s32) = COPY $w3
+    ; CHECK-NEXT: %d:_(s32) = COPY $w4
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], %lane_0(s32), [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], %b(s32), [[C1]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], %c(s32), [[C2]](s64)
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[IVEC3:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC2]], %d(s32), [[C3]](s64)
+    ; CHECK-NEXT: %buildvec0:_(<4 x s32>) = COPY [[IVEC3]](<4 x s32>)
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[IVEC4:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF1]], %lane_1(s32), [[C4]](s64)
+    ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[IVEC5:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC4]], %b(s32), [[C5]](s64)
+    ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK-NEXT: [[IVEC6:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC5]], %c(s32), [[C6]](s64)
+    ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; CHECK-NEXT: [[IVEC7:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC6]], %d(s32), [[C7]](s64)
+    ; CHECK-NEXT: %buildvec1:_(<4 x s32>) = COPY [[IVEC7]](<4 x s32>)
+    ; CHECK-NEXT: %shuf:_(<4 x s32>) = G_SHUFFLE_VECTOR %buildvec0(<4 x s32>), %buildvec1, shufflemask(4, 4, 4, 4)
+    ; CHECK-NEXT: $q0 = COPY %shuf(<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
     %lane_0:_(s32) = COPY $w0
     %lane_1:_(s32) = COPY $w1
     %b:_(s32) = COPY $w2

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index f47da47002fbcd..9734ab35bd6b2d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -76,7 +76,7 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ; CHECK-GI-NEXT:    bic w9, w9, w8
 ; CHECK-GI-NEXT:    and w8, w8, w10
 ; CHECK-GI-NEXT:    orr w8, w9, w8
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %neg = xor <1 x i32> %C, <i32 -1>

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index 6431cfc58a54d2..45ad4b07ff66f7 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -76,7 +76,7 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ; CHECK-GI-NEXT:    and w9, w8, w9
 ; CHECK-GI-NEXT:    bic w8, w10, w8
 ; CHECK-GI-NEXT:    orr w8, w9, w8
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %and = and <1 x i32> %C, %B

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 307aa397eabbbe..d677526bab0005 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -209,24 +209,22 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
 ; CHECK-GI-NEXT:    ldr w8, [x0]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    uxtb w8, w8
-; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b1, v0.b[2]
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
 ; CHECK-GI-NEXT:    mov b3, v0.b[3]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    fmov w10, s2
 ; CHECK-GI-NEXT:    fmov w11, s3
+; CHECK-GI-NEXT:    ldr d2, [x1]
 ; CHECK-GI-NEXT:    uxtb w9, w9
 ; CHECK-GI-NEXT:    uxtb w10, w10
 ; CHECK-GI-NEXT:    uxtb w11, w11
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    fmov s3, w11
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    mov v1.h[1], w11
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ldr d2, [x1]
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    sshll v1.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
@@ -269,25 +267,25 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 ;
 ; CHECK-GI-LABEL: smull_zext_v2i32_v2i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr h1, [x0]
+; CHECK-GI-NEXT:    ld1 { v1.h }[0], [x0]
 ; CHECK-GI-NEXT:    ldr h2, [x0, #2]
 ; CHECK-GI-NEXT:    movi d0, #0x00ffff0000ffff
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    mov w8, v0.s[0]
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
-; CHECK-GI-NEXT:    ldr d0, [x1]
-; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    fmov d1, x8
-; CHECK-GI-NEXT:    fmov x11, d0
-; CHECK-GI-NEXT:    mov v1.d[1], x9
-; CHECK-GI-NEXT:    mov x9, v0.d[1]
-; CHECK-GI-NEXT:    fmov x10, d1
-; CHECK-GI-NEXT:    mov x8, v1.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mov x11, v1.d[1]
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov d0, x10
-; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mul x9, x10, x11
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
 ; CHECK-GI-NEXT:    ret
   %load.A = load <2 x i16>, ptr %A
   %load.B = load <2 x i32>, ptr %B
@@ -322,14 +320,14 @@ define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 ; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    fmov x11, d1
-; CHECK-GI-NEXT:    mov x9, v1.d[1]
-; CHECK-GI-NEXT:    fmov x10, d0
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mov x11, v1.d[1]
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov d0, x10
-; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mul x9, x10, x11
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
 ; CHECK-GI-NEXT:    ret
   %load.A = load <2 x i32>, ptr %A
   %and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
@@ -1048,14 +1046,14 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
 ; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI36_0]
-; CHECK-GI-NEXT:    fmov x10, d0
-; CHECK-GI-NEXT:    fmov x11, d1
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mov x9, v1.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x11, v1.d[1]
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov d0, x10
-; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mul x9, x10, x11
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
 ; CHECK-GI-NEXT:    ret
   %tmp3 = sext <2 x i32> %arg to <2 x i64>
   %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
@@ -1163,14 +1161,14 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI40_0
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI40_0]
-; CHECK-GI-NEXT:    fmov x10, d0
-; CHECK-GI-NEXT:    fmov x11, d1
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mov x9, v1.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x11, v1.d[1]
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov d0, x10
-; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mul x9, x10, x11
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
 ; CHECK-GI-NEXT:    ret
   %tmp3 = zext <2 x i32> %arg to <2 x i64>
   %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
@@ -1264,15 +1262,15 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI43_0
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI43_0]
-; CHECK-GI-NEXT:    fmov x10, d0
-; CHECK-GI-NEXT:    fmov x11, d1
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x11, v1.d[1]
 ; CHECK-GI-NEXT:    movi v1.2d, #0x000000ffffffff
-; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov d0, x10
-; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mul x9, x10, x11
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
   %tmp3 = zext <2 x i32> %arg to <2 x i64>
@@ -1891,15 +1889,15 @@ define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
 ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    fmov x10, d0
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    fmov x11, d1
-; CHECK-GI-NEXT:    mov x9, v1.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mov x11, v1.d[1]
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov d0, x10
-; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mul x9, x10, x11
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <2 x i32> %src1 to <2 x i64>
@@ -1947,10 +1945,10 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
 ; CHECK-GI-NEXT:    fmov x9, d0
 ; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x9, x9, x12
-; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    mul x11, x13, x14
+; CHECK-GI-NEXT:    mov v1.d[0], x9
 ; CHECK-GI-NEXT:    mov v0.d[1], x10
-; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    mov v1.d[1], x11
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1992,9 +1990,9 @@ define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
 ; CHECK-GI-NEXT:    mul x8, x8, x9
 ; CHECK-GI-NEXT:    mul x9, x12, x9
 ; CHECK-GI-NEXT:    mul x10, x10, x11
-; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    mul x11, x13, x11
-; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v1.d[0], x9
 ; CHECK-GI-NEXT:    mov v0.d[1], x10
 ; CHECK-GI-NEXT:    mov v1.d[1], x11
 ; CHECK-GI-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 78c1ff7b993706..6da019a79b7277 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -247,7 +247,7 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
 ; CHECK-GI-NEXT:    fmov w9, s0
 ; CHECK-GI-NEXT:    cmp w8, #0
 ; CHECK-GI-NEXT:    cneg w8, w9, le
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -299,10 +299,8 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){
 ; CHECK-GI-LABEL: abs_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], w1
+; CHECK-GI-NEXT:    mov v0.b[2], w2
 ; CHECK-GI-NEXT:    abs v0.8b, v0.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
 ; CHECK-GI-NEXT:    umov w1, v0.b[1]

diff  --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll
index ee15445a7bbd62..fc1a0c71d4cdf0 100644
--- a/llvm/test/CodeGen/AArch64/add.ll
+++ b/llvm/test/CodeGen/AArch64/add.ll
@@ -71,13 +71,13 @@ define void @v2i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v2i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #1]
-; CHECK-GI-NEXT:    ldr b2, [x1]
+; CHECK-GI-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    ld1 { v1.b }[0], [x1]
+; CHECK-GI-NEXT:    ldr b2, [x0, #1]
 ; CHECK-GI-NEXT:    ldr b3, [x1, #1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    add v0.2s, v0.2s, v2.2s
+; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v3.s[0]
+; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str b0, [x0]
 ; CHECK-GI-NEXT:    str b1, [x0, #1]
@@ -112,22 +112,18 @@ define void @v3i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w10, [x1]
+; CHECK-GI-NEXT:    ldrb w9, [x1]
+; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    fmov s3, w11
 ; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
 ; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    add v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov h2, v0.h[2]
 ; CHECK-GI-NEXT:    str b0, [x0]
@@ -159,27 +155,27 @@ define void @v4i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v5.8b, #0
 ; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov h3, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v1.8h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x0]
 ; CHECK-GI-NEXT:    ret
@@ -247,13 +243,13 @@ define void @v2i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str h0, [x0]
@@ -281,18 +277,16 @@ define void @v3i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    add x10, x1, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #4]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    ldr h3, [x1, #4]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    add v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]

diff  --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll
index 1176c98ce44e34..5385a917619fa0 100644
--- a/llvm/test/CodeGen/AArch64/andorxor.ll
+++ b/llvm/test/CodeGen/AArch64/andorxor.ll
@@ -191,13 +191,13 @@ define void @and_v2i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: and_v2i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #1]
-; CHECK-GI-NEXT:    ldr b2, [x1]
+; CHECK-GI-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    ld1 { v1.b }[0], [x1]
+; CHECK-GI-NEXT:    ldr b2, [x0, #1]
 ; CHECK-GI-NEXT:    ldr b3, [x1, #1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v3.s[0]
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str b0, [x0]
 ; CHECK-GI-NEXT:    str b1, [x0, #1]
@@ -228,13 +228,13 @@ define void @or_v2i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: or_v2i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #1]
-; CHECK-GI-NEXT:    ldr b2, [x1]
+; CHECK-GI-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    ld1 { v1.b }[0], [x1]
+; CHECK-GI-NEXT:    ldr b2, [x0, #1]
 ; CHECK-GI-NEXT:    ldr b3, [x1, #1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v3.s[0]
+; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str b0, [x0]
 ; CHECK-GI-NEXT:    str b1, [x0, #1]
@@ -265,13 +265,13 @@ define void @xor_v2i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: xor_v2i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #1]
-; CHECK-GI-NEXT:    ldr b2, [x1]
+; CHECK-GI-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    ld1 { v1.b }[0], [x1]
+; CHECK-GI-NEXT:    ldr b2, [x0, #1]
 ; CHECK-GI-NEXT:    ldr b3, [x1, #1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v3.s[0]
+; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str b0, [x0]
 ; CHECK-GI-NEXT:    str b1, [x0, #1]
@@ -306,22 +306,18 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: and_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w10, [x1]
+; CHECK-GI-NEXT:    ldrb w9, [x1]
+; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    fmov s3, w11
 ; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
 ; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov h2, v0.h[2]
 ; CHECK-GI-NEXT:    str b0, [x0]
@@ -358,22 +354,18 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: or_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w10, [x1]
+; CHECK-GI-NEXT:    ldrb w9, [x1]
+; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    fmov s3, w11
 ; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
 ; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov h2, v0.h[2]
 ; CHECK-GI-NEXT:    str b0, [x0]
@@ -410,22 +402,18 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: xor_v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w10, [x1]
+; CHECK-GI-NEXT:    ldrb w9, [x1]
+; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    fmov s3, w11
 ; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
 ; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov h2, v0.h[2]
 ; CHECK-GI-NEXT:    str b0, [x0]
@@ -459,27 +447,27 @@ define void @and_v4i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v5.8b, #0
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov h3, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v1.8h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x0]
 ; CHECK-GI-NEXT:    ret
@@ -510,27 +498,27 @@ define void @or_v4i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v5.8b, #0
 ; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov h3, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v1.8h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x0]
 ; CHECK-GI-NEXT:    ret
@@ -561,27 +549,27 @@ define void @xor_v4i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v5.8b, #0
 ; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov h3, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v1.8h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x0]
 ; CHECK-GI-NEXT:    ret
@@ -723,13 +711,13 @@ define void @and_v2i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: and_v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str h0, [x0]
@@ -762,13 +750,13 @@ define void @or_v2i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: or_v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str h0, [x0]
@@ -801,13 +789,13 @@ define void @xor_v2i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: xor_v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str h0, [x0]
@@ -836,18 +824,16 @@ define void @and_v3i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: and_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    add x10, x1, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #4]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    ldr h3, [x1, #4]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]
@@ -875,18 +861,16 @@ define void @or_v3i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: or_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    add x10, x1, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #4]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    ldr h3, [x1, #4]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
+; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]
@@ -914,18 +898,16 @@ define void @xor_v3i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: xor_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    add x10, x1, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #4]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    ldr h3, [x1, #4]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
+; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]

diff  --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 0291f8c9123047..a25763e3b15907 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -334,25 +334,40 @@ entry:
 }
 
 define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone  {
-; CHECK-LABEL: f:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    mov.s v0[1], w1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: f:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov.s v0[1], w1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov.s v0[0], w0
+; CHECK-GI-NEXT:    mov.s v0[1], w1
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
   %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
   %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
   ret <2 x i32> %vecinit1
 }
 
 define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone  {
-; CHECK-LABEL: g:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    mov.s v0[1], w1
-; CHECK-NEXT:    mov.s v0[2], w1
-; CHECK-NEXT:    mov.s v0[3], w0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: g:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov.s v0[1], w1
+; CHECK-SD-NEXT:    mov.s v0[2], w1
+; CHECK-SD-NEXT:    mov.s v0[3], w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: g:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov.s v0[0], w0
+; CHECK-GI-NEXT:    mov.s v0[1], w1
+; CHECK-GI-NEXT:    mov.s v0[2], w1
+; CHECK-GI-NEXT:    mov.s v0[3], w0
+; CHECK-GI-NEXT:    ret
   %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
   %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
@@ -361,11 +376,17 @@ define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone  {
 }
 
 define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone  {
-; CHECK-LABEL: h:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    mov.d v0[1], x1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: h:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d0, x0
+; CHECK-SD-NEXT:    mov.d v0[1], x1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: h:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov.d v0[0], x0
+; CHECK-GI-NEXT:    mov.d v0[1], x1
+; CHECK-GI-NEXT:    ret
   %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
   %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
   ret <2 x i64> %vecinit1
@@ -386,8 +407,8 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
 ;
 ; CHECK-GI-LABEL: test_build_illegal:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov s0, v0[3]
-; CHECK-GI-NEXT:    mov.h v0[3], v0[0]
+; CHECK-GI-NEXT:    mov.s w8, v0[3]
+; CHECK-GI-NEXT:    mov.h v0[3], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %val = extractelement <4 x i32> %in, i32 3

diff  --git a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
index bc399c8d4ff071..8611532d6ea924 100644
--- a/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
@@ -29,19 +29,20 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
 ; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GISEL-NEXT:    mov w9, w0
 ; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    mov b1, v0.b[1]
 ; CHECK-GISEL-NEXT:    add x8, sp, #8
-; CHECK-GISEL-NEXT:    and x9, x9, #0x7
 ; CHECK-GISEL-NEXT:    str d0, [sp, #8]
+; CHECK-GISEL-NEXT:    and x9, x9, #0x7
+; CHECK-GISEL-NEXT:    mov b2, v0.b[1]
 ; CHECK-GISEL-NEXT:    mov b3, v0.b[2]
 ; CHECK-GISEL-NEXT:    lsl x10, x9, #1
 ; CHECK-GISEL-NEXT:    mov b0, v0.b[3]
 ; CHECK-GISEL-NEXT:    sub x9, x10, x9
-; CHECK-GISEL-NEXT:    ldr b2, [x8, x9]
-; CHECK-GISEL-NEXT:    mov v2.b[1], v1.b[0]
-; CHECK-GISEL-NEXT:    mov v2.b[2], v3.b[0]
-; CHECK-GISEL-NEXT:    mov v2.b[3], v0.b[0]
-; CHECK-GISEL-NEXT:    ushll v0.8h, v2.8b, #0
+; CHECK-GISEL-NEXT:    ldr b1, [x8, x9]
+; CHECK-GISEL-NEXT:    mov v1.b[0], v1.b[0]
+; CHECK-GISEL-NEXT:    mov v1.b[1], v2.b[0]
+; CHECK-GISEL-NEXT:    mov v1.b[2], v3.b[0]
+; CHECK-GISEL-NEXT:    mov v1.b[3], v0.b[0]
+; CHECK-GISEL-NEXT:    ushll v0.8h, v1.8b, #0
 ; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GISEL-NEXT:    add sp, sp, #16
 ; CHECK-GISEL-NEXT:    ret
@@ -82,14 +83,15 @@ define <8 x i8> @test_varidx_extract_v16s8(<16 x i8> %x, i32 %idx) {
 ; CHECK-GISEL-NEXT:    sub sp, sp, #16
 ; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GISEL-NEXT:    mov w9, w0
-; CHECK-GISEL-NEXT:    mov b2, v0.b[1]
 ; CHECK-GISEL-NEXT:    mov x8, sp
-; CHECK-GISEL-NEXT:    and x9, x9, #0xf
 ; CHECK-GISEL-NEXT:    str q0, [sp]
+; CHECK-GISEL-NEXT:    and x9, x9, #0xf
+; CHECK-GISEL-NEXT:    mov b2, v0.b[1]
 ; CHECK-GISEL-NEXT:    mov b3, v0.b[2]
 ; CHECK-GISEL-NEXT:    lsl x10, x9, #1
 ; CHECK-GISEL-NEXT:    sub x9, x10, x9
 ; CHECK-GISEL-NEXT:    ldr b1, [x8, x9]
+; CHECK-GISEL-NEXT:    mov v1.b[0], v1.b[0]
 ; CHECK-GISEL-NEXT:    mov v1.b[1], v2.b[0]
 ; CHECK-GISEL-NEXT:    mov b2, v0.b[3]
 ; CHECK-GISEL-NEXT:    mov v1.b[2], v3.b[0]
@@ -176,15 +178,14 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) {
 ; CHECK-GISEL:       // %bb.0:
 ; CHECK-GISEL-NEXT:    sub sp, sp, #16
 ; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GISEL-NEXT:    mov w9, w0
-; CHECK-GISEL-NEXT:    mov h1, v0.h[1]
+; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GISEL-NEXT:    add x8, sp, #8
 ; CHECK-GISEL-NEXT:    str d0, [sp, #8]
 ; CHECK-GISEL-NEXT:    and x9, x9, #0x3
-; CHECK-GISEL-NEXT:    ldr h0, [x8, x9, lsl #1]
-; CHECK-GISEL-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GISEL-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GISEL-NEXT:    ldr h1, [x8, x9, lsl #1]
+; CHECK-GISEL-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GISEL-NEXT:    ushll v0.4s, v1.4h, #0
 ; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GISEL-NEXT:    add sp, sp, #16
 ; CHECK-GISEL-NEXT:    ret
@@ -217,16 +218,13 @@ define <4 x i16> @test_varidx_extract_v8s16(<8 x i16> %x, i32 %idx) {
 ; CHECK-GISEL-NEXT:    sub sp, sp, #16
 ; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GISEL-NEXT:    mov w9, w0
-; CHECK-GISEL-NEXT:    mov h2, v0.h[1]
 ; CHECK-GISEL-NEXT:    mov x8, sp
 ; CHECK-GISEL-NEXT:    str q0, [sp]
 ; CHECK-GISEL-NEXT:    and x9, x9, #0x7
-; CHECK-GISEL-NEXT:    mov h3, v0.h[2]
 ; CHECK-GISEL-NEXT:    ldr h1, [x8, x9, lsl #1]
-; CHECK-GISEL-NEXT:    mov h0, v0.h[3]
-; CHECK-GISEL-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GISEL-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GISEL-NEXT:    mov v1.h[3], v0.h[0]
+; CHECK-GISEL-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GISEL-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GISEL-NEXT:    mov v1.h[3], v0.h[3]
 ; CHECK-GISEL-NEXT:    fmov d0, d1
 ; CHECK-GISEL-NEXT:    add sp, sp, #16
 ; CHECK-GISEL-NEXT:    ret
@@ -289,13 +287,12 @@ define <2 x i32> @test_varidx_extract_v4s32(<4 x i32> %x, i32 %idx) {
 ; CHECK-GISEL-NEXT:    sub sp, sp, #16
 ; CHECK-GISEL-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GISEL-NEXT:    mov w9, w0
-; CHECK-GISEL-NEXT:    mov s1, v0.s[1]
 ; CHECK-GISEL-NEXT:    mov x8, sp
 ; CHECK-GISEL-NEXT:    str q0, [sp]
 ; CHECK-GISEL-NEXT:    and x9, x9, #0x3
-; CHECK-GISEL-NEXT:    ldr s0, [x8, x9, lsl #2]
-; CHECK-GISEL-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GISEL-NEXT:    ldr s1, [x8, x9, lsl #2]
+; CHECK-GISEL-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GISEL-NEXT:    fmov d0, d1
 ; CHECK-GISEL-NEXT:    add sp, sp, #16
 ; CHECK-GISEL-NEXT:    ret
   %tmp = extractelement <4 x i32> %x, i32 %idx

diff  --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 720951eca6a344..0412aef7545e9d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -13820,12 +13820,10 @@ define void @test_ld1lane_build(ptr %ptr0, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr
 ; CHECK-GI-LABEL: test_ld1lane_build:
 ; CHECK-GI:       ; %bb.0:
 ; CHECK-GI-NEXT:    ldr s0, [x0]
-; CHECK-GI-NEXT:    ldr s1, [x1]
-; CHECK-GI-NEXT:    ldr s2, [x2]
-; CHECK-GI-NEXT:    ldr s3, [x3]
-; CHECK-GI-NEXT:    mov.s v0[1], v1[0]
-; CHECK-GI-NEXT:    mov.s v2[1], v3[0]
-; CHECK-GI-NEXT:    sub.2s v0, v0, v2
+; CHECK-GI-NEXT:    ldr s1, [x2]
+; CHECK-GI-NEXT:    ld1.s { v0 }[1], [x1]
+; CHECK-GI-NEXT:    ld1.s { v1 }[1], [x3]
+; CHECK-GI-NEXT:    sub.2s v0, v0, v1
 ; CHECK-GI-NEXT:    str d0, [x4]
 ; CHECK-GI-NEXT:    ret
   %load0 = load i32, ptr %ptr0, align 4
@@ -13844,28 +13842,15 @@ define void @test_ld1lane_build(ptr %ptr0, ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr
 }
 
 define void  @test_ld1lane_build_i16(ptr %a, ptr %b, ptr %c, ptr %d, <4 x i16> %e, ptr %p) {
-; CHECK-SD-LABEL: test_ld1lane_build_i16:
-; CHECK-SD:       ; %bb.0:
-; CHECK-SD-NEXT:    ldr h1, [x0]
-; CHECK-SD-NEXT:    ld1.h { v1 }[1], [x1]
-; CHECK-SD-NEXT:    ld1.h { v1 }[2], [x2]
-; CHECK-SD-NEXT:    ld1.h { v1 }[3], [x3]
-; CHECK-SD-NEXT:    sub.4h v0, v1, v0
-; CHECK-SD-NEXT:    str d0, [x4]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_ld1lane_build_i16:
-; CHECK-GI:       ; %bb.0:
-; CHECK-GI-NEXT:    ldr h1, [x0]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    mov.h v1[1], v2[0]
-; CHECK-GI-NEXT:    ldr h2, [x2]
-; CHECK-GI-NEXT:    mov.h v1[2], v2[0]
-; CHECK-GI-NEXT:    ldr h2, [x3]
-; CHECK-GI-NEXT:    mov.h v1[3], v2[0]
-; CHECK-GI-NEXT:    sub.4h v0, v1, v0
-; CHECK-GI-NEXT:    str d0, [x4]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_ld1lane_build_i16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr h1, [x0]
+; CHECK-NEXT:    ld1.h { v1 }[1], [x1]
+; CHECK-NEXT:    ld1.h { v1 }[2], [x2]
+; CHECK-NEXT:    ld1.h { v1 }[3], [x3]
+; CHECK-NEXT:    sub.4h v0, v1, v0
+; CHECK-NEXT:    str d0, [x4]
+; CHECK-NEXT:    ret
   %ld.a = load i16, ptr %a
   %ld.b = load i16, ptr %b
   %ld.c = load i16, ptr %c
@@ -13880,34 +13865,18 @@ define void  @test_ld1lane_build_i16(ptr %a, ptr %b, ptr %c, ptr %d, <4 x i16> %
 }
 
 define void  @test_ld1lane_build_half(ptr %a, ptr %b, ptr %c, ptr %d, <4 x half> %e, ptr %p) {
-; CHECK-SD-LABEL: test_ld1lane_build_half:
-; CHECK-SD:       ; %bb.0:
-; CHECK-SD-NEXT:    ldr h1, [x0]
-; CHECK-SD-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-SD-NEXT:    ld1.h { v1 }[1], [x1]
-; CHECK-SD-NEXT:    ld1.h { v1 }[2], [x2]
-; CHECK-SD-NEXT:    ld1.h { v1 }[3], [x3]
-; CHECK-SD-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-SD-NEXT:    fsub.4s v0, v1, v0
-; CHECK-SD-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    str d0, [x4]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_ld1lane_build_half:
-; CHECK-GI:       ; %bb.0:
-; CHECK-GI-NEXT:    ldr h1, [x0]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NEXT:    mov.h v1[1], v2[0]
-; CHECK-GI-NEXT:    ldr h2, [x2]
-; CHECK-GI-NEXT:    mov.h v1[2], v2[0]
-; CHECK-GI-NEXT:    ldr h2, [x3]
-; CHECK-GI-NEXT:    mov.h v1[3], v2[0]
-; CHECK-GI-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NEXT:    fsub.4s v0, v1, v0
-; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    str d0, [x4]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_ld1lane_build_half:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr h1, [x0]
+; CHECK-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-NEXT:    ld1.h { v1 }[1], [x1]
+; CHECK-NEXT:    ld1.h { v1 }[2], [x2]
+; CHECK-NEXT:    ld1.h { v1 }[3], [x3]
+; CHECK-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-NEXT:    fsub.4s v0, v1, v0
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    str d0, [x4]
+; CHECK-NEXT:    ret
   %ld.a = load half, ptr %a
   %ld.b = load half, ptr %b
   %ld.c = load half, ptr %c
@@ -13942,6 +13911,7 @@ define void  @test_ld1lane_build_i8(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr
 ; CHECK-GI-NEXT:    ldr b1, [x0]
 ; CHECK-GI-NEXT:    ldr b2, [x1]
 ; CHECK-GI-NEXT:    ldr x8, [sp]
+; CHECK-GI-NEXT:    mov.b v1[0], v1[0]
 ; CHECK-GI-NEXT:    mov.b v1[1], v2[0]
 ; CHECK-GI-NEXT:    ldr b2, [x2]
 ; CHECK-GI-NEXT:    mov.b v1[2], v2[0]

diff  --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index c56f4409e3a627..c0d91c1e0c836b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1259,7 +1259,7 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
 ;
 ; CHECK-GI-LABEL: scalar_to_vector.v2i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    mov v0.s[0], w0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %b = insertelement <2 x i32> undef, i32 %a, i32 0
@@ -1267,19 +1267,29 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
 }
 
 define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
-; CHECK-LABEL: scalar_to_vector.v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_to_vector.v4i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_to_vector.v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v0.s[0], w0
+; CHECK-GI-NEXT:    ret
   %b = insertelement <4 x i32> undef, i32 %a, i32 0
   ret <4 x i32> %b
 }
 
 define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
-; CHECK-LABEL: scalar_to_vector.v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: scalar_to_vector.v2i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d0, x0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: scalar_to_vector.v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    ret
   %b = insertelement <2 x i64> undef, i64 %a, i32 0
   ret <2 x i64> %b
 }
@@ -1348,21 +1358,22 @@ define <8 x i8> @getl(<16 x i8> %x) #0 {
 ;
 ; CHECK-GI-LABEL: getl:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov b4, v0.b[4]
-; CHECK-GI-NEXT:    mov b5, v0.b[5]
-; CHECK-GI-NEXT:    mov b6, v0.b[6]
-; CHECK-GI-NEXT:    mov b7, v0.b[7]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v4.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v5.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v6.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v7.b[0]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    mov b2, v0.b[1]
+; CHECK-GI-NEXT:    mov v1.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[3]
+; CHECK-GI-NEXT:    mov v1.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[4]
+; CHECK-GI-NEXT:    mov v1.b[3], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[5]
+; CHECK-GI-NEXT:    mov v1.b[4], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[6]
+; CHECK-GI-NEXT:    mov b0, v0.b[7]
+; CHECK-GI-NEXT:    mov v1.b[5], v2.b[0]
+; CHECK-GI-NEXT:    mov v1.b[6], v3.b[0]
+; CHECK-GI-NEXT:    mov v1.b[7], v0.b[0]
+; CHECK-GI-NEXT:    fmov d0, d1
 ; CHECK-GI-NEXT:    ret
   %vecext = extractelement <16 x i8> %x, i32 0
   %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0
@@ -1405,16 +1416,13 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) {
 ; CHECK-GI-NEXT:    sub sp, sp, #16
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GI-NEXT:    mov w9, w0
-; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    mov x8, sp
 ; CHECK-GI-NEXT:    str q0, [sp]
 ; CHECK-GI-NEXT:    and x9, x9, #0x7
-; CHECK-GI-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NEXT:    ldr h1, [x8, x9, lsl #1]
-; CHECK-GI-NEXT:    mov h0, v0.h[3]
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
 ; CHECK-GI-NEXT:    fmov d0, d1
 ; CHECK-GI-NEXT:    add sp, sp, #16
 ; CHECK-GI-NEXT:    ret
@@ -1709,8 +1717,8 @@ define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: test_concat_undef_v1i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov v0.s[1], v0.s[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1794,25 +1802,26 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
 ;
 ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v16i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov v2.16b, v1.16b
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov b3, v0.b[1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI127_0
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v0.b[2]
-; CHECK-GI-NEXT:    mov b4, v0.b[3]
-; CHECK-GI-NEXT:    mov b5, v0.b[4]
-; CHECK-GI-NEXT:    mov b6, v0.b[5]
-; CHECK-GI-NEXT:    mov b7, v0.b[6]
-; CHECK-GI-NEXT:    mov b16, v0.b[7]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI127_0]
-; CHECK-GI-NEXT:    mov v0.b[2], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v4.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v5.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v6.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v7.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v16.b[0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    mov v1.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v1.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov v1.b[3], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[5]
+; CHECK-GI-NEXT:    mov v1.b[4], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[6]
+; CHECK-GI-NEXT:    mov b0, v0.b[7]
+; CHECK-GI-NEXT:    mov v1.b[5], v3.b[0]
+; CHECK-GI-NEXT:    mov v1.b[6], v4.b[0]
+; CHECK-GI-NEXT:    mov v1.b[7], v0.b[0]
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI127_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v0.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <8 x i8> %x, i32 0
@@ -1844,36 +1853,38 @@ define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 {
 ;
 ; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v8i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov b4, v0.b[3]
-; CHECK-GI-NEXT:    mov b5, v0.b[4]
-; CHECK-GI-NEXT:    mov b6, v0.b[5]
-; CHECK-GI-NEXT:    mov b7, v0.b[6]
-; CHECK-GI-NEXT:    mov b16, v0.b[7]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[1]
-; CHECK-GI-NEXT:    mov v0.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov v2.b[1], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov v2.b[3], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[5]
+; CHECK-GI-NEXT:    mov v2.b[4], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[6]
+; CHECK-GI-NEXT:    mov b0, v0.b[7]
+; CHECK-GI-NEXT:    mov v2.b[5], v3.b[0]
 ; CHECK-GI-NEXT:    mov b3, v1.b[2]
-; CHECK-GI-NEXT:    mov v0.b[3], v4.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v5.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v6.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v7.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v16.b[0]
-; CHECK-GI-NEXT:    mov v0.b[8], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[9], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[10], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[6], v4.b[0]
+; CHECK-GI-NEXT:    mov v2.b[7], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v1.b[1]
+; CHECK-GI-NEXT:    mov v2.b[8], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[9], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v1.b[3]
+; CHECK-GI-NEXT:    mov v2.b[10], v3.b[0]
 ; CHECK-GI-NEXT:    mov b3, v1.b[4]
-; CHECK-GI-NEXT:    mov v0.b[11], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[5]
-; CHECK-GI-NEXT:    mov v0.b[12], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[11], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v1.b[5]
+; CHECK-GI-NEXT:    mov v2.b[12], v3.b[0]
 ; CHECK-GI-NEXT:    mov b3, v1.b[6]
-; CHECK-GI-NEXT:    mov b1, v1.b[7]
-; CHECK-GI-NEXT:    mov v0.b[13], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[14], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[15], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[13], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v1.b[7]
+; CHECK-GI-NEXT:    mov v2.b[14], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[15], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <16 x i8> %x, i32 0
@@ -1922,36 +1933,38 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
 ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b3, v0.b[1]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov b4, v0.b[3]
-; CHECK-GI-NEXT:    mov b5, v0.b[4]
-; CHECK-GI-NEXT:    mov b6, v0.b[5]
-; CHECK-GI-NEXT:    mov b7, v0.b[6]
-; CHECK-GI-NEXT:    mov b16, v0.b[7]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[1]
-; CHECK-GI-NEXT:    mov v0.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[2]
+; CHECK-GI-NEXT:    mov v2.b[1], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[4]
+; CHECK-GI-NEXT:    mov v2.b[3], v3.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[5]
+; CHECK-GI-NEXT:    mov v2.b[4], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v0.b[6]
+; CHECK-GI-NEXT:    mov b0, v0.b[7]
+; CHECK-GI-NEXT:    mov v2.b[5], v3.b[0]
 ; CHECK-GI-NEXT:    mov b3, v1.b[2]
-; CHECK-GI-NEXT:    mov v0.b[3], v4.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v5.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v6.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v7.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v16.b[0]
-; CHECK-GI-NEXT:    mov v0.b[8], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[9], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[10], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[6], v4.b[0]
+; CHECK-GI-NEXT:    mov v2.b[7], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v1.b[1]
+; CHECK-GI-NEXT:    mov v2.b[8], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[9], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v1.b[3]
+; CHECK-GI-NEXT:    mov v2.b[10], v3.b[0]
 ; CHECK-GI-NEXT:    mov b3, v1.b[4]
-; CHECK-GI-NEXT:    mov v0.b[11], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[5]
-; CHECK-GI-NEXT:    mov v0.b[12], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[11], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v1.b[5]
+; CHECK-GI-NEXT:    mov v2.b[12], v3.b[0]
 ; CHECK-GI-NEXT:    mov b3, v1.b[6]
-; CHECK-GI-NEXT:    mov b1, v1.b[7]
-; CHECK-GI-NEXT:    mov v0.b[13], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[14], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[15], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[13], v0.b[0]
+; CHECK-GI-NEXT:    mov b0, v1.b[7]
+; CHECK-GI-NEXT:    mov v2.b[14], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[15], v0.b[0]
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <8 x i8> %x, i32 0
@@ -2017,17 +2030,15 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
 ;
 ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov v2.16b, v1.16b
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI131_0
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI131_0]
-; CHECK-GI-NEXT:    mov v0.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI131_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v0.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <4 x i16> %x, i32 0
@@ -2051,20 +2062,16 @@ define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 {
 ;
 ; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v4i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NEXT:    mov h3, v0.h[2]
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov h3, v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NEXT:    mov h1, v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[6], v3.h[0]
-; CHECK-GI-NEXT:    mov v0.h[7], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v2.h[4], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[5], v1.h[1]
+; CHECK-GI-NEXT:    mov v2.h[6], v1.h[2]
+; CHECK-GI-NEXT:    mov v2.h[7], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <8 x i16> %x, i32 0
@@ -2097,20 +2104,16 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
 ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NEXT:    mov h3, v0.h[2]
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov h3, v1.h[2]
-; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NEXT:    mov h1, v1.h[3]
-; CHECK-GI-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[6], v3.h[0]
-; CHECK-GI-NEXT:    mov v0.h[7], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v2.h[4], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[5], v1.h[1]
+; CHECK-GI-NEXT:    mov v2.h[6], v1.h[2]
+; CHECK-GI-NEXT:    mov v2.h[7], v1.h[3]
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <4 x i16> %x, i32 0
@@ -2160,13 +2163,13 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
 ;
 ; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
+; CHECK-GI-NEXT:    mov v2.16b, v1.16b
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI135_0
-; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI135_0]
-; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI135_0]
+; CHECK-GI-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v0.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <2 x i32> %x, i32 0
@@ -2186,12 +2189,12 @@ define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 x i32> %y) #0 {
 ;
 ; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v2i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v2.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[3], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <4 x i32> %x, i32 0
@@ -2241,11 +2244,18 @@ entry:
 }
 
 define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 {
-; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_concat_v2i64_v2i64_v1i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_concat_v2i64_v2i64_v1i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.d[0], v0.d[0]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-GI-NEXT:    ret
 entry:
   %vecext = extractelement <2 x i64> %x, i32 0
   %vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0

diff  --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index 1f5654d59926dc..a6a825b26b3b52 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -466,92 +466,62 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) {
 ;
 ; CHECK-GI-LABEL: sext_v32i1:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr w9, [sp, #64]
-; CHECK-GI-NEXT:    ldr w8, [sp, #72]
+; CHECK-GI-NEXT:    ldr w8, [sp, #64]
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s2, w1
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w9, [sp, #72]
+; CHECK-GI-NEXT:    fmov s1, w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #80]
+; CHECK-GI-NEXT:    mov.b v0[1], w1
+; CHECK-GI-NEXT:    mov.b v1[1], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #128]
-; CHECK-GI-NEXT:    mov.b v0[1], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w2
-; CHECK-GI-NEXT:    mov.b v1[1], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov.b v0[2], w2
+; CHECK-GI-NEXT:    mov.b v1[2], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #88]
-; CHECK-GI-NEXT:    mov.b v0[2], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w3
-; CHECK-GI-NEXT:    mov.b v1[2], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov.b v0[3], w3
+; CHECK-GI-NEXT:    mov.b v1[3], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #96]
-; CHECK-GI-NEXT:    mov.b v0[3], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w4
-; CHECK-GI-NEXT:    mov.b v1[3], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov.b v0[4], w4
+; CHECK-GI-NEXT:    mov.b v1[4], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #104]
-; CHECK-GI-NEXT:    mov.b v0[4], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w5
-; CHECK-GI-NEXT:    mov.b v1[4], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov.b v0[5], w5
+; CHECK-GI-NEXT:    mov.b v1[5], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #112]
-; CHECK-GI-NEXT:    mov.b v0[5], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w6
-; CHECK-GI-NEXT:    mov.b v1[5], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov.b v0[6], w6
+; CHECK-GI-NEXT:    mov.b v1[6], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #120]
-; CHECK-GI-NEXT:    mov.b v0[6], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w7
-; CHECK-GI-NEXT:    mov.b v1[6], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov.b v0[7], w7
+; CHECK-GI-NEXT:    mov.b v1[7], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp]
-; CHECK-GI-NEXT:    mov.b v0[7], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov.b v0[8], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #8]
-; CHECK-GI-NEXT:    mov.b v1[7], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov.b v1[8], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #136]
-; CHECK-GI-NEXT:    mov.b v0[8], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov.b v0[9], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-NEXT:    mov.b v1[8], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov.b v1[9], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #144]
-; CHECK-GI-NEXT:    mov.b v0[9], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov.b v0[10], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-NEXT:    mov.b v1[9], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov.b v1[10], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #152]
-; CHECK-GI-NEXT:    mov.b v0[10], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov.b v0[11], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #32]
-; CHECK-GI-NEXT:    mov.b v1[10], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov.b v1[11], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #160]
-; CHECK-GI-NEXT:    mov.b v0[11], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov.b v0[12], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #40]
-; CHECK-GI-NEXT:    mov.b v1[11], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov.b v1[12], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #168]
-; CHECK-GI-NEXT:    mov.b v0[12], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov.b v0[13], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #48]
-; CHECK-GI-NEXT:    mov.b v1[12], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov.b v1[13], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #176]
-; CHECK-GI-NEXT:    mov.b v0[13], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov.b v0[14], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #56]
-; CHECK-GI-NEXT:    mov.b v1[13], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov.b v1[14], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #184]
-; CHECK-GI-NEXT:    mov.b v0[14], v2[0]
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    mov.b v1[14], v3[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    mov.b v0[15], v2[0]
-; CHECK-GI-NEXT:    mov.b v1[15], v3[0]
+; CHECK-GI-NEXT:    mov.b v0[15], w8
+; CHECK-GI-NEXT:    mov.b v1[15], w9
 ; CHECK-GI-NEXT:    shl.16b v0, v0, #7
 ; CHECK-GI-NEXT:    shl.16b v1, v1, #7
 ; CHECK-GI-NEXT:    sshr.16b v0, v0, #7
@@ -840,194 +810,134 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) {
 ; CHECK-GI-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GI-NEXT:    .cfi_offset w29, -16
-; CHECK-GI-NEXT:    ldr w9, [sp, #80]
-; CHECK-GI-NEXT:    ldr w11, [sp, #88]
+; CHECK-GI-NEXT:    ldr w13, [sp, #80]
+; CHECK-GI-NEXT:    ldr w11, [sp, #208]
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s3, w1
-; CHECK-GI-NEXT:    ldr w8, [sp, #208]
-; CHECK-GI-NEXT:    ldr w10, [sp, #216]
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    fmov s4, w11
 ; CHECK-GI-NEXT:    ldr w9, [sp, #336]
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    fmov s5, w10
-; CHECK-GI-NEXT:    ldr w11, [sp, #344]
-; CHECK-GI-NEXT:    mov.b v0[1], v3[0]
+; CHECK-GI-NEXT:    ldr w8, [sp, #88]
+; CHECK-GI-NEXT:    ldr w10, [sp, #216]
+; CHECK-GI-NEXT:    fmov s1, w13
+; CHECK-GI-NEXT:    fmov s2, w11
+; CHECK-GI-NEXT:    ldr w12, [sp, #344]
 ; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov.b v0[1], w1
 ; CHECK-GI-NEXT:    ldr w9, [sp, #224]
-; CHECK-GI-NEXT:    mov.b v1[1], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w2
-; CHECK-GI-NEXT:    fmov s6, w11
-; CHECK-GI-NEXT:    mov.b v2[1], v5[0]
+; CHECK-GI-NEXT:    ldr w11, [sp, #400]
+; CHECK-GI-NEXT:    mov.b v1[1], w8
+; CHECK-GI-NEXT:    mov.b v2[1], w10
 ; CHECK-GI-NEXT:    ldr w8, [sp, #96]
+; CHECK-GI-NEXT:    mov.b v3[1], w12
 ; CHECK-GI-NEXT:    ldr w10, [sp, #352]
-; CHECK-GI-NEXT:    ldr w11, [sp, #16]
-; CHECK-GI-NEXT:    mov.b v0[2], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #232]
-; CHECK-GI-NEXT:    mov.b v3[1], v6[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    mov.b v0[2], w2
+; CHECK-GI-NEXT:    mov.b v1[2], w8
+; CHECK-GI-NEXT:    mov.b v2[2], w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #104]
+; CHECK-GI-NEXT:    mov.b v3[2], w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #232]
 ; CHECK-GI-NEXT:    ldr w10, [sp, #360]
-; CHECK-GI-NEXT:    mov.b v2[2], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w3
-; CHECK-GI-NEXT:    mov.b v1[2], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    mov.b v0[3], w3
+; CHECK-GI-NEXT:    mov.b v1[3], w8
+; CHECK-GI-NEXT:    mov.b v2[3], w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #112]
-; CHECK-GI-NEXT:    mov.b v3[2], v6[0]
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #368]
-; CHECK-GI-NEXT:    mov.b v0[3], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    mov.b v3[3], w10
 ; CHECK-GI-NEXT:    ldr w9, [sp, #240]
-; CHECK-GI-NEXT:    mov.b v1[3], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w10, [sp, #368]
+; CHECK-GI-NEXT:    mov.b v0[4], w4
+; CHECK-GI-NEXT:    mov.b v1[4], w8
+; CHECK-GI-NEXT:    mov.b v2[4], w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #120]
-; CHECK-GI-NEXT:    mov.b v2[3], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w4
-; CHECK-GI-NEXT:    mov.b v3[3], v6[0]
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #376]
-; CHECK-GI-NEXT:    mov.b v0[4], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    mov.b v3[4], w10
 ; CHECK-GI-NEXT:    ldr w9, [sp, #248]
-; CHECK-GI-NEXT:    mov.b v1[4], v5[0]
-; CHECK-GI-NEXT:    mov.b v3[4], v6[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #376]
+; CHECK-GI-NEXT:    mov.b v0[5], w5
+; CHECK-GI-NEXT:    mov.b v1[5], w8
+; CHECK-GI-NEXT:    mov.b v2[5], w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #128]
-; CHECK-GI-NEXT:    ldr w10, [sp, #384]
-; CHECK-GI-NEXT:    mov.b v2[4], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w5
-; CHECK-GI-NEXT:    mov.b v1[5], v5[0]
-; CHECK-GI-NEXT:    mov.b v3[5], v6[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    mov.b v0[5], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    mov.b v3[5], w10
 ; CHECK-GI-NEXT:    ldr w9, [sp, #256]
-; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #384]
+; CHECK-GI-NEXT:    mov.b v0[6], w6
+; CHECK-GI-NEXT:    mov.b v1[6], w8
+; CHECK-GI-NEXT:    mov.b v2[6], w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #136]
-; CHECK-GI-NEXT:    ldr w10, [sp, #392]
-; CHECK-GI-NEXT:    mov.b v2[5], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w6
-; CHECK-GI-NEXT:    mov.b v1[6], v5[0]
-; CHECK-GI-NEXT:    mov.b v3[6], v6[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w8, [sp, #144]
-; CHECK-GI-NEXT:    ldr w10, [sp, #400]
-; CHECK-GI-NEXT:    mov.b v0[6], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    mov.b v3[6], w10
 ; CHECK-GI-NEXT:    ldr w9, [sp, #264]
-; CHECK-GI-NEXT:    mov.b v1[7], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #152]
-; CHECK-GI-NEXT:    mov.b v3[7], v6[0]
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #408]
-; CHECK-GI-NEXT:    mov.b v2[6], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w7
-; CHECK-GI-NEXT:    mov.b v1[8], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #160]
-; CHECK-GI-NEXT:    mov.b v0[7], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #272]
-; CHECK-GI-NEXT:    mov.b v3[8], v6[0]
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #416]
-; CHECK-GI-NEXT:    mov.b v2[7], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    ldr w11, [sp, #24]
-; CHECK-GI-NEXT:    mov.b v1[9], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #168]
-; CHECK-GI-NEXT:    mov.b v3[9], v6[0]
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #424]
-; CHECK-GI-NEXT:    mov.b v0[8], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #280]
-; CHECK-GI-NEXT:    mov.b v1[10], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #176]
-; CHECK-GI-NEXT:    mov.b v2[8], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    ldr w11, [sp, #32]
-; CHECK-GI-NEXT:    mov.b v3[10], v6[0]
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #432]
-; CHECK-GI-NEXT:    mov.b v0[9], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #288]
-; CHECK-GI-NEXT:    mov.b v1[11], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #184]
-; CHECK-GI-NEXT:    mov.b v3[11], v6[0]
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #440]
-; CHECK-GI-NEXT:    mov.b v2[9], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    ldr w11, [sp, #40]
-; CHECK-GI-NEXT:    mov.b v1[12], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #192]
-; CHECK-GI-NEXT:    mov.b v0[10], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #296]
-; CHECK-GI-NEXT:    mov.b v3[12], v6[0]
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #448]
-; CHECK-GI-NEXT:    mov.b v2[10], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    ldr w11, [sp, #48]
-; CHECK-GI-NEXT:    mov.b v1[13], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #200]
-; CHECK-GI-NEXT:    mov.b v3[13], v6[0]
-; CHECK-GI-NEXT:    fmov s6, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #456]
-; CHECK-GI-NEXT:    mov.b v0[11], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #304]
-; CHECK-GI-NEXT:    fmov s7, w10
-; CHECK-GI-NEXT:    mov.b v1[14], v5[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    mov.b v2[11], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    ldr w11, [sp, #56]
-; CHECK-GI-NEXT:    mov.b v3[14], v6[0]
-; CHECK-GI-NEXT:    mov.b v0[12], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #312]
-; CHECK-GI-NEXT:    mov.b v1[15], v5[0]
-; CHECK-GI-NEXT:    mov.b v3[15], v7[0]
-; CHECK-GI-NEXT:    mov.b v2[12], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    ldr w11, [sp, #64]
-; CHECK-GI-NEXT:    shl.16b v1, v1, #7
-; CHECK-GI-NEXT:    mov.b v0[13], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #320]
-; CHECK-GI-NEXT:    shl.16b v3, v3, #7
-; CHECK-GI-NEXT:    sshr.16b v1, v1, #7
-; CHECK-GI-NEXT:    mov.b v2[13], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    ldr w11, [sp, #72]
-; CHECK-GI-NEXT:    sshr.16b v3, v3, #7
-; CHECK-GI-NEXT:    mov.b v0[14], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #328]
-; CHECK-GI-NEXT:    fmov s6, w9
-; CHECK-GI-NEXT:    mov.b v2[14], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    mov.b v0[15], v4[0]
-; CHECK-GI-NEXT:    mov.b v2[15], v6[0]
+; CHECK-GI-NEXT:    ldr w10, [sp, #392]
+; CHECK-GI-NEXT:    mov.b v0[7], w7
+; CHECK-GI-NEXT:    mov.b v1[7], w8
+; CHECK-GI-NEXT:    mov.b v2[7], w9
+; CHECK-GI-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-NEXT:    mov.b v3[7], w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #144]
+; CHECK-GI-NEXT:    ldr w10, [sp, #272]
+; CHECK-GI-NEXT:    mov.b v0[8], w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    mov.b v1[8], w9
+; CHECK-GI-NEXT:    mov.b v2[8], w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #152]
+; CHECK-GI-NEXT:    mov.b v3[8], w11
+; CHECK-GI-NEXT:    ldr w10, [sp, #280]
+; CHECK-GI-NEXT:    ldr w11, [sp, #408]
+; CHECK-GI-NEXT:    mov.b v0[9], w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-NEXT:    mov.b v1[9], w9
+; CHECK-GI-NEXT:    mov.b v2[9], w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #160]
+; CHECK-GI-NEXT:    mov.b v3[9], w11
+; CHECK-GI-NEXT:    ldr w10, [sp, #288]
+; CHECK-GI-NEXT:    ldr w11, [sp, #416]
+; CHECK-GI-NEXT:    mov.b v0[10], w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-NEXT:    mov.b v1[10], w9
+; CHECK-GI-NEXT:    mov.b v2[10], w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #168]
+; CHECK-GI-NEXT:    mov.b v3[10], w11
+; CHECK-GI-NEXT:    ldr w10, [sp, #296]
+; CHECK-GI-NEXT:    ldr w11, [sp, #424]
+; CHECK-GI-NEXT:    mov.b v0[11], w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #48]
+; CHECK-GI-NEXT:    mov.b v1[11], w9
+; CHECK-GI-NEXT:    mov.b v2[11], w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #176]
+; CHECK-GI-NEXT:    mov.b v3[11], w11
+; CHECK-GI-NEXT:    ldr w10, [sp, #304]
+; CHECK-GI-NEXT:    ldr w11, [sp, #432]
+; CHECK-GI-NEXT:    mov.b v0[12], w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-NEXT:    mov.b v1[12], w9
+; CHECK-GI-NEXT:    mov.b v2[12], w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #184]
+; CHECK-GI-NEXT:    mov.b v3[12], w11
+; CHECK-GI-NEXT:    ldr w10, [sp, #312]
+; CHECK-GI-NEXT:    ldr w11, [sp, #440]
+; CHECK-GI-NEXT:    mov.b v0[13], w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #64]
+; CHECK-GI-NEXT:    mov.b v1[13], w9
+; CHECK-GI-NEXT:    mov.b v2[13], w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #192]
+; CHECK-GI-NEXT:    mov.b v3[13], w11
+; CHECK-GI-NEXT:    ldr w10, [sp, #320]
+; CHECK-GI-NEXT:    ldr w11, [sp, #448]
+; CHECK-GI-NEXT:    mov.b v0[14], w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #72]
+; CHECK-GI-NEXT:    mov.b v1[14], w9
+; CHECK-GI-NEXT:    mov.b v2[14], w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #200]
+; CHECK-GI-NEXT:    mov.b v3[14], w11
+; CHECK-GI-NEXT:    ldr w10, [sp, #328]
+; CHECK-GI-NEXT:    ldr w11, [sp, #456]
+; CHECK-GI-NEXT:    mov.b v0[15], w8
+; CHECK-GI-NEXT:    mov.b v1[15], w9
+; CHECK-GI-NEXT:    mov.b v2[15], w10
+; CHECK-GI-NEXT:    mov.b v3[15], w11
 ; CHECK-GI-NEXT:    shl.16b v0, v0, #7
+; CHECK-GI-NEXT:    shl.16b v1, v1, #7
 ; CHECK-GI-NEXT:    shl.16b v2, v2, #7
+; CHECK-GI-NEXT:    shl.16b v3, v3, #7
 ; CHECK-GI-NEXT:    sshr.16b v0, v0, #7
+; CHECK-GI-NEXT:    sshr.16b v1, v1, #7
 ; CHECK-GI-NEXT:    sshr.16b v2, v2, #7
+; CHECK-GI-NEXT:    sshr.16b v3, v3, #7
 ; CHECK-GI-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
   %res = sext <64 x i1> %arg to <64 x i8>

diff  --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
index 44b92e6ccd088f..a854cb7fec9917 100644
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -368,28 +368,26 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
 ; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
 ; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-GI-NEXT:    mov.16b v5, v4
-; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov.b v4[1], w0
+; CHECK-GI-NEXT:    mov.b v4[2], w0
+; CHECK-GI-NEXT:    mov.b v4[3], w0
+; CHECK-GI-NEXT:    mov.b v4[4], w0
+; CHECK-GI-NEXT:    mov.b v4[5], w0
+; CHECK-GI-NEXT:    mov.b v4[6], w0
+; CHECK-GI-NEXT:    mov.b v4[7], w0
+; CHECK-GI-NEXT:    mov.b v4[8], w8
+; CHECK-GI-NEXT:    mov.b v4[9], w8
+; CHECK-GI-NEXT:    mov.b v4[10], w8
+; CHECK-GI-NEXT:    mov.b v4[11], w8
+; CHECK-GI-NEXT:    mov.b v4[12], w8
+; CHECK-GI-NEXT:    mov.b v4[13], w8
+; CHECK-GI-NEXT:    mov.b v4[14], w8
+; CHECK-GI-NEXT:    mov.b v4[15], w8
 ; CHECK-GI-NEXT:    adrp x8, .LCPI10_1
-; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[12], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[13], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
-; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI10_1]
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI10_1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI10_0
-; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v5
-; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v5
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI10_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    ret
@@ -488,35 +486,32 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x
 ; CHECK-GI-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov w8, #1 // =0x1
-; CHECK-GI-NEXT:    fmov s6, w0
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
 ; CHECK-GI-NEXT:    fmov s4, w8
-; CHECK-GI-NEXT:    mov w8, #255 // =0xff
 ; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-GI-NEXT:    mov.16b v5, v4
-; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov.b v4[1], w8
+; CHECK-GI-NEXT:    mov.b v4[2], w8
+; CHECK-GI-NEXT:    mov.b v4[3], w8
+; CHECK-GI-NEXT:    mov.b v4[4], w8
+; CHECK-GI-NEXT:    mov.b v4[5], w8
+; CHECK-GI-NEXT:    mov.b v4[6], w8
+; CHECK-GI-NEXT:    mov.b v4[7], w8
+; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    mov.b v4[8], w8
+; CHECK-GI-NEXT:    mov.b v4[9], w8
+; CHECK-GI-NEXT:    mov.b v4[10], w8
+; CHECK-GI-NEXT:    mov.b v4[11], w8
+; CHECK-GI-NEXT:    mov.b v4[12], w0
+; CHECK-GI-NEXT:    mov.b v4[13], w0
+; CHECK-GI-NEXT:    mov.b v4[14], w8
 ; CHECK-GI-NEXT:    adrp x8, .LCPI11_1
-; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[12], v6[0]
-; CHECK-GI-NEXT:    mov.b v5[13], v6[0]
-; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
-; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI11_1]
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI11_1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI11_0
-; CHECK-GI-NEXT:    mov.b v5[15], v6[0]
-; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v5
-; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v4
+; CHECK-GI-NEXT:    mov.b v4[15], w0
+; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v4
+; CHECK-GI-NEXT:    tbl.16b v1, { v2, v3 }, v5
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI11_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
 ; CHECK-GI-NEXT:    ret
@@ -623,32 +618,30 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    fmov s4, w0
 ; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    adrp x9, .LCPI12_1
 ; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-GI-NEXT:    ldr q5, [x9, :lo12:.LCPI12_1]
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
 ; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
 ; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    mov.16b v5, v4
-; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
-; CHECK-GI-NEXT:    fmov s4, w8
-; CHECK-GI-NEXT:    adrp x8, .LCPI12_1
-; CHECK-GI-NEXT:    mov.b v5[8], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[9], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[10], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[11], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[12], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[13], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
-; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI12_1]
+; CHECK-GI-NEXT:    mov.b v4[1], w0
+; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v5
+; CHECK-GI-NEXT:    mov.b v4[2], w0
+; CHECK-GI-NEXT:    mov.b v4[3], w0
+; CHECK-GI-NEXT:    mov.b v4[4], w0
+; CHECK-GI-NEXT:    mov.b v4[5], w0
+; CHECK-GI-NEXT:    mov.b v4[6], w0
+; CHECK-GI-NEXT:    mov.b v4[7], w0
+; CHECK-GI-NEXT:    mov.b v4[8], w8
+; CHECK-GI-NEXT:    mov.b v4[9], w8
+; CHECK-GI-NEXT:    mov.b v4[10], w8
+; CHECK-GI-NEXT:    mov.b v4[11], w8
+; CHECK-GI-NEXT:    mov.b v4[12], w8
+; CHECK-GI-NEXT:    mov.b v4[13], w8
+; CHECK-GI-NEXT:    mov.b v4[14], w8
+; CHECK-GI-NEXT:    mov.b v4[15], w8
 ; CHECK-GI-NEXT:    adrp x8, .LCPI12_0
-; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v4
-; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v4
 ; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI12_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v2, v3 }, v0
 ; CHECK-GI-NEXT:    ret
@@ -774,30 +767,28 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16
 ; CHECK-GI-NEXT:    mov w8, #255 // =0xff
 ; CHECK-GI-NEXT:    // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
 ; CHECK-GI-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    fmov s6, w8
-; CHECK-GI-NEXT:    adrp x8, .LCPI13_1
 ; CHECK-GI-NEXT:    // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
 ; CHECK-GI-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-GI-NEXT:    mov.16b v5, v4
-; CHECK-GI-NEXT:    mov.b v5[1], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[2], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[3], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[4], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[5], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[6], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[7], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[8], v6[0]
-; CHECK-GI-NEXT:    mov.b v5[9], v6[0]
-; CHECK-GI-NEXT:    mov.b v5[10], v6[0]
-; CHECK-GI-NEXT:    mov.b v5[11], v6[0]
-; CHECK-GI-NEXT:    mov.b v5[12], v6[0]
-; CHECK-GI-NEXT:    mov.b v5[13], v6[0]
-; CHECK-GI-NEXT:    mov.b v5[14], v4[0]
-; CHECK-GI-NEXT:    mov.b v5[15], v4[0]
-; CHECK-GI-NEXT:    ldr q4, [x8, :lo12:.LCPI13_1]
+; CHECK-GI-NEXT:    mov.b v4[1], w0
+; CHECK-GI-NEXT:    mov.b v4[2], w0
+; CHECK-GI-NEXT:    mov.b v4[3], w0
+; CHECK-GI-NEXT:    mov.b v4[4], w0
+; CHECK-GI-NEXT:    mov.b v4[5], w0
+; CHECK-GI-NEXT:    mov.b v4[6], w0
+; CHECK-GI-NEXT:    mov.b v4[7], w0
+; CHECK-GI-NEXT:    mov.b v4[8], w8
+; CHECK-GI-NEXT:    mov.b v4[9], w8
+; CHECK-GI-NEXT:    mov.b v4[10], w8
+; CHECK-GI-NEXT:    mov.b v4[11], w8
+; CHECK-GI-NEXT:    mov.b v4[12], w8
+; CHECK-GI-NEXT:    mov.b v4[13], w8
+; CHECK-GI-NEXT:    adrp x8, .LCPI13_1
+; CHECK-GI-NEXT:    ldr q5, [x8, :lo12:.LCPI13_1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
-; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v4
-; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v5
+; CHECK-GI-NEXT:    tbl.16b v2, { v2, v3 }, v5
+; CHECK-GI-NEXT:    mov.b v4[14], w0
+; CHECK-GI-NEXT:    mov.b v4[15], w0
+; CHECK-GI-NEXT:    tbl.16b v3, { v0, v1 }, v4
 ; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI13_0]
 ; CHECK-GI-NEXT:    tbl.16b v0, { v2, v3 }, v0
 ; CHECK-GI-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index 5de99586f7fc78..79cfeedb74bce0 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -13,7 +13,7 @@ define <4 x i16> @foo1(<2 x i32> %a) {
 ; CHECK-GI-LABEL: foo1:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov w8, #58712 // =0xe558
-; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-GI-NEXT:    rev32 v0.4h, v0.4h
 ; CHECK-GI-NEXT:    ret
@@ -33,7 +33,7 @@ define <4 x i16> @foo2(<2 x i32> %a) {
 ; CHECK-GI-LABEL: foo2:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov w8, #712 // =0x2c8
-; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-GI-NEXT:    rev32 v0.4h, v0.4h
 ; CHECK-GI-NEXT:    ret
@@ -60,13 +60,11 @@ define i32 @bitcast_v4i8_i32(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-GI-LABEL: bitcast_v4i8_i32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov h3, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v1.8h
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
   %c = add <4 x i8> %a, %b
@@ -87,12 +85,13 @@ define <4 x i8> @bitcast_i32_v4i8(i32 %a, i32 %b){
 ; CHECK-GI-NEXT:    add w8, w0, w1
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %c = add i32 %a, %b
@@ -117,9 +116,9 @@ define i32 @bitcast_v2i16_i32(<2 x i16> %a, <2 x i16> %b){
 ; CHECK-GI-LABEL: bitcast_v2i16_i32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    xtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
   %c = add <2 x i16> %a, %b
@@ -419,16 +418,17 @@ define <4 x i8> @bitcast_v2i16_v4i8(<2 x i16> %a, <2 x i16> %b){
 ; CHECK-GI-LABEL: bitcast_v2i16_v4i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    xtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %c = add <2 x i16> %a, %b
@@ -455,13 +455,11 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-GI-LABEL: bitcast_v4i8_v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov h3, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v1.8h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
@@ -515,10 +513,12 @@ define <4 x i64> @bitcast_v8i32_v4i64(<8 x i32> %a, <8 x i32> %b){
 ;
 ; CHECK-GI-LABEL: bitcast_v8i32_v4i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    add v3.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mov x8, v2.d[1]
+; CHECK-GI-NEXT:    mov x9, v3.d[1]
+; CHECK-GI-NEXT:    mov v0.d[0], v2.d[0]
+; CHECK-GI-NEXT:    mov v1.d[0], v3.d[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    mov v1.d[1], x9
 ; CHECK-GI-NEXT:    ret
@@ -574,10 +574,12 @@ define <4 x i64> @bitcast_v16i16_v4i64(<16 x i16> %a, <16 x i16> %b){
 ;
 ; CHECK-GI-LABEL: bitcast_v16i16_v4i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    add v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT:    add v1.8h, v1.8h, v3.8h
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    add v2.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    add v3.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    mov x8, v2.d[1]
+; CHECK-GI-NEXT:    mov x9, v3.d[1]
+; CHECK-GI-NEXT:    mov v0.d[0], v2.d[0]
+; CHECK-GI-NEXT:    mov v1.d[0], v3.d[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    mov v1.d[1], x9
 ; CHECK-GI-NEXT:    ret
@@ -614,14 +616,18 @@ define <8 x i64> @bitcast_v16i32_v8i64(<16 x i32> %a, <16 x i32> %b){
 ;
 ; CHECK-GI-LABEL: bitcast_v16i32_v8i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    add v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    add v1.4s, v1.4s, v5.4s
-; CHECK-GI-NEXT:    add v2.4s, v2.4s, v6.4s
-; CHECK-GI-NEXT:    add v3.4s, v3.4s, v7.4s
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mov x9, v1.d[1]
-; CHECK-GI-NEXT:    mov x10, v2.d[1]
-; CHECK-GI-NEXT:    mov x11, v3.d[1]
+; CHECK-GI-NEXT:    add v4.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT:    add v5.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT:    add v6.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT:    add v7.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT:    mov x8, v4.d[1]
+; CHECK-GI-NEXT:    mov x9, v5.d[1]
+; CHECK-GI-NEXT:    mov x10, v6.d[1]
+; CHECK-GI-NEXT:    mov x11, v7.d[1]
+; CHECK-GI-NEXT:    mov v0.d[0], v4.d[0]
+; CHECK-GI-NEXT:    mov v1.d[0], v5.d[0]
+; CHECK-GI-NEXT:    mov v2.d[0], v6.d[0]
+; CHECK-GI-NEXT:    mov v3.d[0], v7.d[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    mov v1.d[1], x9
 ; CHECK-GI-NEXT:    mov v2.d[1], x10

diff  --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 071613b9cc011e..9ee924dd2548a6 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -110,8 +110,8 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %a){
 ; CHECK-GI-LABEL: bswap_v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
 ; CHECK-GI-NEXT:    rev16 v0.8b, v0.8b
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
@@ -146,7 +146,7 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    rev w8, w8
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index f6eeeef4faf7ed..18570b2d793ff6 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -11,12 +11,13 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
 ; CHECK-GI-LABEL: concat1:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov s2, v1.s[1]
-; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v2.b[0]
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    mov v0.b[3], w9
 ; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -32,22 +33,20 @@ define <8 x i8> @concat2(<4 x i8> %A, <4 x i8> %B) {
 ;
 ; CHECK-GI-LABEL: concat2:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov h3, v0.h[1]
-; CHECK-GI-NEXT:    mov h4, v1.h[2]
-; CHECK-GI-NEXT:    mov h5, v1.h[3]
-; CHECK-GI-NEXT:    mov h6, v0.h[3]
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov v0.h[1], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[2], v4.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v5.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v6.h[0]
-; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v3.h[0], v1.h[0]
+; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v3.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v3.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v3.h[3], v1.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v2.8h
+; CHECK-GI-NEXT:    xtn v1.8b, v3.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -75,14 +74,16 @@ define <4 x i16> @concat4(<2 x i16> %A, <2 x i16> %B) {
 ;
 ; CHECK-GI-LABEL: concat4:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov s2, v1.s[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s3, v0.s[1]
-; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
-; CHECK-GI-NEXT:    mov v0.s[1], v3.s[0]
-; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    xtn v1.4h, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -145,8 +146,9 @@ define <4 x half> @concat9(<2 x half> %A, <2 x half> %B) {
 ;
 ; CHECK-GI-LABEL: concat9:
 ; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    fmov w8, s1
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -181,12 +183,14 @@ define <8 x i16> @concat_v8s16_v2s16(ptr %ptr) {
 ;
 ; CHECK-GI-LABEL: concat_v8s16_v2s16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    dup v0.4s, w8
 ; CHECK-GI-NEXT:    ldr h1, [x0]
 ; CHECK-GI-NEXT:    ldr h2, [x0, #2]
+; CHECK-GI-NEXT:    dup v0.4s, w8
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    xtn v2.4h, v0.4s
-; CHECK-GI-NEXT:    xtn v0.4h, v1.4s
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
@@ -208,9 +212,10 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) {
 ; CHECK-GI-NEXT:    dup v0.8h, w8
 ; CHECK-GI-NEXT:    xtn v1.8b, v0.8h
 ; CHECK-GI-NEXT:    ldr s0, [x0]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v1.s[0]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    ret
     %a = load <4 x i8>, ptr %ptr
     %b = shufflevector <4 x i8> %a, <4 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -218,24 +223,13 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) {
 }
 
 define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %ptrD) {
-; CHECK-SD-LABEL: concat_v16s8_v4s8_load:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ldr s0, [x0]
-; CHECK-SD-NEXT:    ld1 { v0.s }[1], [x1]
-; CHECK-SD-NEXT:    ld1 { v0.s }[2], [x2]
-; CHECK-SD-NEXT:    ld1 { v0.s }[3], [x3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: concat_v16s8_v4s8_load:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr s0, [x0]
-; CHECK-GI-NEXT:    ldr s1, [x1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    ldr s1, [x2]
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    ldr s1, [x3]
-; CHECK-GI-NEXT:    mov v0.s[3], v1.s[0]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: concat_v16s8_v4s8_load:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
+; CHECK-NEXT:    ld1 { v0.s }[2], [x2]
+; CHECK-NEXT:    ld1 { v0.s }[3], [x3]
+; CHECK-NEXT:    ret
     %A = load <4 x i8>, ptr %ptrA
     %B = load <4 x i8>, ptr %ptrB
     %C = load <4 x i8>, ptr %ptrC
@@ -261,41 +255,35 @@ define <16 x i8> @concat_v16s8_v4s8_reg(<4 x i8> %A, <4 x i8> %B, <4 x i8> %C, <
 ;
 ; CHECK-GI-LABEL: concat_v16s8_v4s8_reg:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov h4, v1.h[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov h5, v0.h[1]
+; CHECK-GI-NEXT:    mov v4.h[0], v0.h[0]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v5.h[0], v1.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT:    mov h6, v1.h[2]
-; CHECK-GI-NEXT:    mov h7, v1.h[3]
-; CHECK-GI-NEXT:    mov h16, v2.h[1]
-; CHECK-GI-NEXT:    mov h17, v0.h[3]
-; CHECK-GI-NEXT:    mov h18, v2.h[3]
-; CHECK-GI-NEXT:    mov v1.h[1], v4.h[0]
-; CHECK-GI-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NEXT:    mov v0.h[1], v5.h[0]
-; CHECK-GI-NEXT:    mov h5, v2.h[2]
-; CHECK-GI-NEXT:    mov v2.h[1], v16.h[0]
-; CHECK-GI-NEXT:    mov v1.h[2], v6.h[0]
-; CHECK-GI-NEXT:    mov h6, v3.h[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v4.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v5.h[0]
-; CHECK-GI-NEXT:    mov h4, v3.h[2]
-; CHECK-GI-NEXT:    mov h5, v3.h[3]
-; CHECK-GI-NEXT:    mov v1.h[3], v7.h[0]
-; CHECK-GI-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v17.h[0]
-; CHECK-GI-NEXT:    mov v2.h[3], v18.h[0]
-; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
-; CHECK-GI-NEXT:    mov v3.h[2], v4.h[0]
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
-; CHECK-GI-NEXT:    xtn v2.8b, v2.8h
-; CHECK-GI-NEXT:    mov v3.h[3], v5.h[0]
+; CHECK-GI-NEXT:    mov v6.h[0], v2.h[0]
+; CHECK-GI-NEXT:    mov v7.h[0], v3.h[0]
+; CHECK-GI-NEXT:    mov v4.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v5.h[1], v1.h[1]
+; CHECK-GI-NEXT:    mov v6.h[1], v2.h[1]
+; CHECK-GI-NEXT:    mov v7.h[1], v3.h[1]
+; CHECK-GI-NEXT:    mov v4.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v5.h[2], v1.h[2]
+; CHECK-GI-NEXT:    mov v6.h[2], v2.h[2]
+; CHECK-GI-NEXT:    mov v7.h[2], v3.h[2]
+; CHECK-GI-NEXT:    mov v4.h[3], v0.h[3]
+; CHECK-GI-NEXT:    mov v5.h[3], v1.h[3]
+; CHECK-GI-NEXT:    mov v6.h[3], v2.h[3]
+; CHECK-GI-NEXT:    mov v7.h[3], v3.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v4.8h
+; CHECK-GI-NEXT:    xtn v1.8b, v5.8h
+; CHECK-GI-NEXT:    xtn v2.8b, v6.8h
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    xtn v1.8b, v7.8h
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    xtn v1.8b, v3.8h
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    mov v0.s[3], w8
@@ -320,27 +308,29 @@ define <8 x i16> @concat_v8s16_v2s16_reg(<2 x i16> %A, <2 x i16> %B, <2 x i16> %
 ;
 ; CHECK-GI-LABEL: concat_v8s16_v2s16_reg:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov s4, v1.s[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s5, v0.s[1]
+; CHECK-GI-NEXT:    mov v4.s[0], v0.s[0]
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov v5.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT:    mov v1.s[1], v4.s[0]
-; CHECK-GI-NEXT:    mov s4, v2.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v5.s[0]
+; CHECK-GI-NEXT:    mov v4.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v5.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v1.s[0], v2.s[0]
+; CHECK-GI-NEXT:    xtn v0.4h, v4.4s
+; CHECK-GI-NEXT:    xtn v4.4h, v5.4s
+; CHECK-GI-NEXT:    mov v1.s[1], v2.s[1]
+; CHECK-GI-NEXT:    mov v2.s[0], v3.s[0]
+; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
-; CHECK-GI-NEXT:    mov v2.s[1], v4.s[0]
-; CHECK-GI-NEXT:    mov s4, v3.s[1]
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov v2.s[1], v3.s[1]
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    fmov w8, s4
 ; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
-; CHECK-GI-NEXT:    mov v3.s[1], v4.s[0]
-; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
-; CHECK-GI-NEXT:    xtn v1.4h, v3.4s
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    ret
     %b = shufflevector <2 x i16> %A, <2 x i16> %B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>

diff  --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index de108b0bc2b7a0..e19e2ead11f4d0 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -161,27 +161,21 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fabs v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fabs v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    fabs v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index 6913a62fb266c1..b15579199a0598 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -188,33 +188,25 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fadd v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fadd_v7f16:
@@ -537,33 +529,25 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fsub v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fsub v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    fsub v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fsub_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index a5d7ae147ffda2..8ca1e9ee5b6178 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -556,7 +556,7 @@ define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double>
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    cset w19, lt
 ; CHECK-GI-NEXT:    bl __lttf2
-; CHECK-GI-NEXT:    fmov d0, x19
+; CHECK-GI-NEXT:    mov v0.d[0], x19
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    cset w8, lt
 ; CHECK-GI-NEXT:    ldp q2, q1, [sp, #32] // 32-byte Folded Reload
@@ -663,29 +663,29 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    cset w22, lt
 ; CHECK-GI-NEXT:    bl __lttf2
-; CHECK-GI-NEXT:    ldp q0, q2, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    sbfx x8, x21, #0, #1
-; CHECK-GI-NEXT:    ldp q4, q3, [sp, #96] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    sbfx x9, x22, #0, #1
-; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr x30, [sp, #128] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v2.d[1], v0.d[0]
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    cset w8, lt
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v1.d[0], x8
+; CHECK-GI-NEXT:    sbfx x8, x22, #0, #1
+; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT:    ldp q4, q3, [sp, #96] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mov v1.d[1], x8
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    cset w8, lt
 ; CHECK-GI-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-GI-NEXT:    mov v1.d[1], x9
-; CHECK-GI-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    and v1.16b, v2.16b, v1.16b
+; CHECK-GI-NEXT:    bic v0.16b, v3.16b, v0.16b
 ; CHECK-GI-NEXT:    and x9, x19, x8
 ; CHECK-GI-NEXT:    bic x8, x20, x8
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    orr x8, x9, x8
-; CHECK-GI-NEXT:    bic v1.16b, v3.16b, v1.16b
-; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
+; CHECK-GI-NEXT:    orr v0.16b, v1.16b, v0.16b
 ; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #176
@@ -831,21 +831,21 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x
 ; CHECK-GI-NEXT:    fcmp d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
-; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    cset w9, mi
-; CHECK-GI-NEXT:    mov v1.s[1], w8
-; CHECK-GI-NEXT:    fmov d2, x9
+; CHECK-GI-NEXT:    mov v2.d[0], x9
+; CHECK-GI-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-GI-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mov v3.s[0], w9
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
 ; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    mov v3.s[1], w9
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    neg v1.4s, v1.4s
-; CHECK-GI-NEXT:    mov v2.s[2], w8
+; CHECK-GI-NEXT:    mov v3.s[2], w9
 ; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v3.16b
 ; CHECK-GI-NEXT:    and v0.16b, v6.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v7.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -902,18 +902,18 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d,
 ; CHECK-GI-LABEL: v3f32_float:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NEXT:    mov v5.s[0], w9
 ; CHECK-GI-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NEXT:    mov v5.s[1], w9
 ; CHECK-GI-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mov v5.s[2], w9
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    neg v4.4s, v4.4s
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
 ; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -980,18 +980,18 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i
 ; CHECK-GI-LABEL: v3f32_i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-GI-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NEXT:    mov v5.s[0], w9
 ; CHECK-GI-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NEXT:    mov v5.s[1], w9
 ; CHECK-GI-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mov v5.s[2], w9
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    neg v4.4s, v4.4s
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
 ; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -1106,44 +1106,38 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ; CHECK-GI-NOFP16-LABEL: v7f16_half:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #15 // =0xf
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
+; CHECK-GI-NOFP16-NEXT:    mov w9, #65535 // =0xffff
+; CHECK-GI-NOFP16-NEXT:    fmov s7, w9
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], w8
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], w9
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v7.16b, v4.16b
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s6, w8
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.16b, v6.16b
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], w8
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[3], w9
 ; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v18.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[2], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v16.4h
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[3], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v16.4s, v5.4s
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[4], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[4], v6.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[4], w8
+; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v6.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[4], w9
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[5], w8
 ; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[5], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[5], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[6], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[6], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v7.8h
-; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v7.8h
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[5], w9
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[6], w8
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[6], w9
+; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v5.8h
+; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v5.8h
 ; CHECK-GI-NOFP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
-; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v0.16b, v17.16b
+; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v0.16b, v7.16b
 ; CHECK-GI-NOFP16-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NOFP16-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -1152,28 +1146,26 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ; CHECK-GI-FP16-LABEL: v7f16_half:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    mov w8, #15 // =0xf
+; CHECK-GI-FP16-NEXT:    mov w9, #65535 // =0xffff
 ; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    fmov s4, w8
-; CHECK-GI-FP16-NEXT:    mov w8, #65535 // =0xffff
-; CHECK-GI-FP16-NEXT:    fmov s6, w8
-; CHECK-GI-FP16-NEXT:    mov v5.16b, v4.16b
-; CHECK-GI-FP16-NEXT:    mov v7.16b, v6.16b
-; CHECK-GI-FP16-NEXT:    mov v5.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v7.h[1], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[2], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v7.h[2], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v7.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[4], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v7.h[4], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[5], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v7.h[5], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[6], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v7.h[6], v6.h[0]
-; CHECK-GI-FP16-NEXT:    ushl v0.8h, v0.8h, v5.8h
-; CHECK-GI-FP16-NEXT:    neg v1.8h, v5.8h
+; CHECK-GI-FP16-NEXT:    fmov s5, w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[1], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[1], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[2], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[2], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[3], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[3], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[4], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[4], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[5], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[5], w9
+; CHECK-GI-FP16-NEXT:    mov v4.h[6], w8
+; CHECK-GI-FP16-NEXT:    mov v5.h[6], w9
+; CHECK-GI-FP16-NEXT:    ushl v0.8h, v0.8h, v4.8h
+; CHECK-GI-FP16-NEXT:    neg v1.8h, v4.8h
 ; CHECK-GI-FP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
-; CHECK-GI-FP16-NEXT:    eor v1.16b, v0.16b, v7.16b
+; CHECK-GI-FP16-NEXT:    eor v1.16b, v0.16b, v5.16b
 ; CHECK-GI-FP16-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-FP16-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-FP16-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -1599,59 +1591,52 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-NOFP16-LABEL: v7f16_i32:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #32]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    fmov s16, w0
-; CHECK-GI-NOFP16-NEXT:    fmov s18, w4
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NOFP16-NEXT:    mov w9, #-1 // =0xffffffff
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[0], w0
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[0], w9
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[0], w7
+; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp]
+; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #24]
+; CHECK-GI-NOFP16-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], w1
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[1], v18.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], v16.s[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #8]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w8
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s3, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s5, [sp]
-; CHECK-GI-NOFP16-NEXT:    mov v16.s[1], w1
-; CHECK-GI-NOFP16-NEXT:    mov v18.s[1], w5
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w8
-; CHECK-GI-NOFP16-NEXT:    fmov w9, s5
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w7
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s7, [sp, #24]
-; CHECK-GI-NOFP16-NEXT:    mov v16.s[2], w2
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], w9
-; CHECK-GI-NOFP16-NEXT:    fmov w9, s6
-; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], v17.s[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #40]
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], w2
+; CHECK-GI-NOFP16-NEXT:    mov v6.s[2], w9
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], v16.s[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #40]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v18.s[2], w6
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    mov v16.s[3], w3
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], w9
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], v17.s[0]
-; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v4.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NOFP16-NEXT:    ushl v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    neg v3.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    sshl v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    fmov w8, s6
-; CHECK-GI-NOFP16-NEXT:    mov v5.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v2.16b, v4.16b
-; CHECK-GI-NOFP16-NEXT:    and v2.16b, v18.16b, v2.16b
-; CHECK-GI-NOFP16-NEXT:    and v1.16b, v7.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v16.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    mov v17.s[2], v16.s[0]
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[3], w3
+; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[0], w4
+; CHECK-GI-NOFP16-NEXT:    ushl v2.4s, v2.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    neg v4.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w5
+; CHECK-GI-NOFP16-NEXT:    sshl v2.4s, v2.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    ldr s4, [sp, #16]
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w6
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[3], v4.s[0]
+; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v2.16b, v6.16b
+; CHECK-GI-NOFP16-NEXT:    and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NOFP16-NEXT:    and v1.16b, v17.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v5.16b, v7.16b
 ; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v2.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov s3, v0.s[2]
@@ -1670,59 +1655,56 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    mov w10, #31 // =0x1f
-; CHECK-GI-FP16-NEXT:    ldr s3, [sp]
-; CHECK-GI-FP16-NEXT:    fmov s2, w10
-; CHECK-GI-FP16-NEXT:    fmov s6, w0
-; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #8]
-; CHECK-GI-FP16-NEXT:    fmov s17, w4
-; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #24]
+; CHECK-GI-FP16-NEXT:    mov w9, #31 // =0x1f
+; CHECK-GI-FP16-NEXT:    mov v4.s[0], w0
+; CHECK-GI-FP16-NEXT:    mov v2.s[0], w9
+; CHECK-GI-FP16-NEXT:    mov v5.s[0], w7
+; CHECK-GI-FP16-NEXT:    ldr s6, [sp]
+; CHECK-GI-FP16-NEXT:    mov v7.s[0], w4
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #32]
+; CHECK-GI-FP16-NEXT:    ldr s17, [sp, #8]
 ; CHECK-GI-FP16-NEXT:    umov w8, v0.h[4]
-; CHECK-GI-FP16-NEXT:    umov w9, v0.h[5]
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov v6.s[1], w1
-; CHECK-GI-FP16-NEXT:    mov v17.s[1], w5
-; CHECK-GI-FP16-NEXT:    mov v7.s[1], v16.s[0]
+; CHECK-GI-FP16-NEXT:    umov w10, v0.h[5]
+; CHECK-GI-FP16-NEXT:    mov v4.s[1], w1
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v5.s[1], v6.s[0]
+; CHECK-GI-FP16-NEXT:    ldr s6, [sp, #24]
+; CHECK-GI-FP16-NEXT:    mov v7.s[1], w5
+; CHECK-GI-FP16-NEXT:    mov v6.s[1], v16.s[0]
 ; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
-; CHECK-GI-FP16-NEXT:    fmov s1, w8
+; CHECK-GI-FP16-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-FP16-NEXT:    umov w8, v0.h[6]
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w10
 ; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-FP16-NEXT:    mov v6.s[2], w2
-; CHECK-GI-FP16-NEXT:    mov v17.s[2], w6
-; CHECK-GI-FP16-NEXT:    mov v7.s[2], v16.s[0]
-; CHECK-GI-FP16-NEXT:    mov v1.s[1], w9
-; CHECK-GI-FP16-NEXT:    mov w9, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    fmov s5, w9
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w9
+; CHECK-GI-FP16-NEXT:    mov v4.s[2], w2
+; CHECK-GI-FP16-NEXT:    mov v5.s[2], v17.s[0]
+; CHECK-GI-FP16-NEXT:    mov v7.s[2], w6
 ; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    mov v6.s[3], w3
-; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s3
-; CHECK-GI-FP16-NEXT:    fmov s3, w7
-; CHECK-GI-FP16-NEXT:    mov v5.s[1], w9
+; CHECK-GI-FP16-NEXT:    mov v6.s[2], v16.s[0]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
+; CHECK-GI-FP16-NEXT:    mov w10, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT:    mov v3.s[0], w10
+; CHECK-GI-FP16-NEXT:    mov v4.s[3], w3
 ; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s4
-; CHECK-GI-FP16-NEXT:    ldr s4, [sp, #16]
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w10
 ; CHECK-GI-FP16-NEXT:    ushl v1.4s, v1.4s, v2.4s
 ; CHECK-GI-FP16-NEXT:    neg v2.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    mov v5.s[2], w9
-; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
+; CHECK-GI-FP16-NEXT:    mov v3.s[2], w10
 ; CHECK-GI-FP16-NEXT:    sshl v1.4s, v1.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    fmov w8, s4
-; CHECK-GI-FP16-NEXT:    eor v2.16b, v1.16b, v5.16b
-; CHECK-GI-FP16-NEXT:    and v1.16b, v17.16b, v1.16b
-; CHECK-GI-FP16-NEXT:    mov v3.s[3], w8
-; CHECK-GI-FP16-NEXT:    and v2.16b, v7.16b, v2.16b
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    ldr s2, [sp, #16]
+; CHECK-GI-FP16-NEXT:    mov v5.s[3], v2.s[0]
+; CHECK-GI-FP16-NEXT:    eor v3.16b, v1.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    and v1.16b, v7.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    and v2.16b, v6.16b, v3.16b
+; CHECK-GI-FP16-NEXT:    bsl v0.16b, v4.16b, v5.16b
 ; CHECK-GI-FP16-NEXT:    orr v1.16b, v1.16b, v2.16b
 ; CHECK-GI-FP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-FP16-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-FP16-NEXT:    fmov w0, s0
 ; CHECK-GI-FP16-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-FP16-NEXT:    mov s6, v1.s[2]
-; CHECK-GI-FP16-NEXT:    fmov w0, s0
 ; CHECK-GI-FP16-NEXT:    fmov w4, s1
 ; CHECK-GI-FP16-NEXT:    fmov w1, s2
 ; CHECK-GI-FP16-NEXT:    fmov w2, s3

diff  --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 84376107679d84..a42ec8e253be29 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -156,8 +156,8 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov w8, #-2147483648 // =0x80000000
 ; CHECK-GI-NEXT:    mov w9, #2147483647 // =0x7fffffff
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w9
+; CHECK-GI-NEXT:    mov v3.s[0], w8
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
 ; CHECK-GI-NEXT:    mov v3.s[1], w8
 ; CHECK-GI-NEXT:    mov v2.s[2], w9
@@ -207,22 +207,20 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    mov w9, #32767 // =0x7fff
 ; CHECK-GI-NEXT:    fmov s2, w9
 ; CHECK-GI-NEXT:    fmov s3, w8
-; CHECK-GI-NEXT:    mov v4.16b, v2.16b
-; CHECK-GI-NEXT:    mov v5.16b, v3.16b
-; CHECK-GI-NEXT:    mov v4.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov v5.h[1], v3.h[0]
-; CHECK-GI-NEXT:    mov v4.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v5.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v4.h[3], v2.h[0]
-; CHECK-GI-NEXT:    mov v5.h[3], v3.h[0]
-; CHECK-GI-NEXT:    mov v4.h[4], v2.h[0]
-; CHECK-GI-NEXT:    mov v5.h[4], v3.h[0]
-; CHECK-GI-NEXT:    mov v4.h[5], v2.h[0]
-; CHECK-GI-NEXT:    mov v5.h[5], v3.h[0]
-; CHECK-GI-NEXT:    mov v4.h[6], v2.h[0]
-; CHECK-GI-NEXT:    mov v5.h[6], v3.h[0]
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v5.16b
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    mov v3.h[2], w8
+; CHECK-GI-NEXT:    mov v2.h[3], w9
+; CHECK-GI-NEXT:    mov v3.h[3], w8
+; CHECK-GI-NEXT:    mov v2.h[4], w9
+; CHECK-GI-NEXT:    mov v3.h[4], w8
+; CHECK-GI-NEXT:    mov v2.h[5], w9
+; CHECK-GI-NEXT:    mov v3.h[5], w8
+; CHECK-GI-NEXT:    mov v2.h[6], w9
+; CHECK-GI-NEXT:    mov v3.h[6], w8
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 1c761ea083028a..b408e9c1bd4e60 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -164,27 +164,21 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: ceil_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintp v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: ceil_v7f16:
@@ -469,27 +463,21 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: floor_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintm v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: floor_v7f16:
@@ -774,27 +762,21 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frinti v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: nearbyint_v7f16:
@@ -1079,27 +1061,21 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: roundeven_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintn v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: roundeven_v7f16:
@@ -1384,27 +1360,21 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: rint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintx v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: rint_v7f16:
@@ -1689,27 +1659,21 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: round_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frinta v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: round_v7f16:
@@ -1994,27 +1958,21 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: trunc_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    frintz v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: trunc_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index d73a5dc73eefcd..5bdccccc62b99c 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -188,33 +188,25 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fdiv v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v1.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fdiv_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index 93d3d96d67b650..30ce389f231281 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -678,12 +678,12 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -701,18 +701,19 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -789,21 +790,21 @@ define <4 x half> @exp_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #56] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr x30, [sp, #72] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -919,12 +920,12 @@ define <8 x half> @exp_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -947,21 +948,21 @@ define <8 x half> @exp_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -1155,7 +1156,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h9
@@ -1180,7 +1181,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -1231,7 +1232,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl expf
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q2, [sp, #112] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x29, x30, [sp, #304] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -1257,7 +1258,7 @@ define <16 x half> @exp_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    fcvt h2, s0
-; CHECK-GI-NEXT:    ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q0, [sp, #128] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    ldr q0, [sp, #160] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
@@ -1948,12 +1949,12 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -1971,18 +1972,19 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -2059,21 +2061,21 @@ define <4 x half> @exp2_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #56] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr x30, [sp, #72] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -2189,12 +2191,12 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -2217,21 +2219,21 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -2425,7 +2427,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h9
@@ -2450,7 +2452,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -2501,7 +2503,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl exp2f
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q2, [sp, #112] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x29, x30, [sp, #304] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -2527,7 +2529,7 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    fcvt h2, s0
-; CHECK-GI-NEXT:    ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q0, [sp, #128] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    ldr q0, [sp, #160] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
@@ -3218,12 +3220,12 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -3241,18 +3243,19 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -3329,21 +3332,21 @@ define <4 x half> @log_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #56] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr x30, [sp, #72] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -3459,12 +3462,12 @@ define <8 x half> @log_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -3487,21 +3490,21 @@ define <8 x half> @log_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -3695,7 +3698,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h9
@@ -3720,7 +3723,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -3771,7 +3774,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl logf
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q2, [sp, #112] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x29, x30, [sp, #304] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -3797,7 +3800,7 @@ define <16 x half> @log_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    fcvt h2, s0
-; CHECK-GI-NEXT:    ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q0, [sp, #128] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    ldr q0, [sp, #160] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
@@ -4488,12 +4491,12 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -4511,18 +4514,19 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -4599,21 +4603,21 @@ define <4 x half> @log2_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #56] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr x30, [sp, #72] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -4729,12 +4733,12 @@ define <8 x half> @log2_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -4757,21 +4761,21 @@ define <8 x half> @log2_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -4965,7 +4969,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h9
@@ -4990,7 +4994,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -5041,7 +5045,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log2f
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q2, [sp, #112] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x29, x30, [sp, #304] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -5067,7 +5071,7 @@ define <16 x half> @log2_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    fcvt h2, s0
-; CHECK-GI-NEXT:    ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q0, [sp, #128] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    ldr q0, [sp, #160] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
@@ -5758,12 +5762,12 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -5781,18 +5785,19 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -5869,21 +5874,21 @@ define <4 x half> @log10_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #56] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr x30, [sp, #72] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -5999,12 +6004,12 @@ define <8 x half> @log10_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -6027,21 +6032,21 @@ define <8 x half> @log10_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -6235,7 +6240,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h9
@@ -6260,7 +6265,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -6311,7 +6316,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl log10f
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q2, [sp, #112] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x29, x30, [sp, #304] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -6337,7 +6342,7 @@ define <16 x half> @log10_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    fcvt h2, s0
-; CHECK-GI-NEXT:    ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q0, [sp, #128] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    ldr q0, [sp, #160] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]

diff  --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index 2ea7e0f3c44a9a..aa20304e52a951 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -11,13 +11,15 @@ define <4 x half> @interleave2_v4f16(<2 x half> %vec0, <2 x half> %vec1) {
 ; CHECK-GI-LABEL: interleave2_v4f16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    dup v2.4s, w8
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
-; CHECK-GI-NEXT:    fmov w8, s2
-; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    xtn v0.4h, v2.4s
+; CHECK-GI-NEXT:    mov v1.s[0], w8
+; CHECK-GI-NEXT:    mov v2.s[0], w9
+; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    mov v1.s[1], w8
-; CHECK-GI-NEXT:    zip1 v0.4h, v0.4h, v1.4h
+; CHECK-GI-NEXT:    mov v2.s[1], w8
+; CHECK-GI-NEXT:    zip1 v0.4h, v1.4h, v2.4h
 ; CHECK-GI-NEXT:    ret
   %retval = call <4 x half> @llvm.vector.interleave2.v4f16(<2 x half> %vec0, <2 x half> %vec1)
   ret <4 x half> %retval

diff  --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index 357d91960624bd..fb12f8acf17453 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -664,33 +664,25 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
 ; CHECK-NOFP16-GI-NEXT:    fmin v2.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fmin v1.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fmin v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h2, v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
@@ -770,33 +762,25 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
 ; CHECK-NOFP16-GI-NEXT:    fmax v2.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fmax v1.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fmax v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h2, v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index 61199f82615bbe..64f0da8b4cd0f9 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -664,33 +664,25 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
 ; CHECK-NOFP16-GI-NEXT:    fminnm v2.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fminnm v1.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fminnm v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h2, v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
@@ -770,33 +762,25 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[0], v0.h[4]
 ; CHECK-NOFP16-GI-NEXT:    fmaxnm v2.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
-; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fmaxnm v1.4s, v2.4s, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-NOFP16-GI-NEXT:    fmaxnm v1.4s, v1.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h2, v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[3], v2.h[3]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 4b019b57d968d3..7bcaae5a77eac5 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -257,39 +257,29 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h18, v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fmla v5.4s, v4.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[1], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v5.4s
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v18.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v16.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v18.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmla v4.4s, v3.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v2.h[5]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v5.4h, v5.4s
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v5.h[1]
+; CHECK-GI-NOFP16-NEXT:    fmla v3.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v5.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fma_v7f16:
@@ -864,46 +854,36 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov h4, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], v16.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v7.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v5.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[5]
+; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmuladd_v7f16:
@@ -1362,46 +1342,36 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov h4, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], v16.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v7.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[0], v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v5.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v2.h[5]
+; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v3.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index 1f41f2385c3357..bd3d1353e643e5 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -188,33 +188,25 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[0], v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v1.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v2.h[3]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index cc0f7d2fd6075d..a0e9edff733e09 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -162,27 +162,21 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fneg v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fneg v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    fneg v2.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index 8d40121ad4543f..6e8cd0c8c00b41 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -965,22 +965,22 @@ define <4 x half> @pow_v4f16(<4 x half> %a, <4 x half> %b) {
 ; CHECK-GI-NEXT:    fcvt s2, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h12
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s2
 ; CHECK-GI-NEXT:    bl powf
 ; CHECK-GI-NEXT:    fcvt s2, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h13
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s2
 ; CHECK-GI-NEXT:    bl powf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]

diff  --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index 5dbcaa4a5fda17..62fc1c0854ca8b 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -869,22 +869,22 @@ define <4 x half> @powi_v4f16(<4 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl __powisf2
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    mov w0, w19
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl __powisf2
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b

diff  --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 0c880592d955b7..20b5567e973d09 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -2585,7 +2585,7 @@ define <3 x i64> @fptos_v3f32_v3i64(<3 x float> %a) {
 ;
 ; CHECK-GI-LABEL: fptos_v3f32_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s1, v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[2]
 ; CHECK-GI-NEXT:    fcvtl v0.2d, v0.2s
 ; CHECK-GI-NEXT:    fcvtl v1.2d, v1.2s
 ; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
@@ -2614,7 +2614,7 @@ define <3 x i64> @fptou_v3f32_v3i64(<3 x float> %a) {
 ;
 ; CHECK-GI-LABEL: fptou_v3f32_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s1, v0.s[2]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[2]
 ; CHECK-GI-NEXT:    fcvtl v0.2d, v0.2s
 ; CHECK-GI-NEXT:    fcvtl v1.2d, v1.2s
 ; CHECK-GI-NEXT:    fcvtzu v0.2d, v0.2d
@@ -3181,10 +3181,10 @@ define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) {
 ; CHECK-GI-LABEL: fptos_v3f32_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov s2, v0.s[2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3202,10 +3202,10 @@ define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) {
 ; CHECK-GI-LABEL: fptou_v3f32_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov s2, v0.s[2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -6077,10 +6077,10 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -6110,10 +6110,10 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -7297,7 +7297,7 @@ define <2 x i64> @fptos_v2f128_v2i64(<2 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov x19, x0
 ; CHECK-GI-NEXT:    bl __fixtfdi
-; CHECK-GI-NEXT:    fmov d0, x19
+; CHECK-GI-NEXT:    mov v0.d[0], x19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.d[1], x0
 ; CHECK-GI-NEXT:    add sp, sp, #32
@@ -7340,7 +7340,7 @@ define <2 x i64> @fptou_v2f128_v2i64(<2 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov x19, x0
 ; CHECK-GI-NEXT:    bl __fixunstfdi
-; CHECK-GI-NEXT:    fmov d0, x19
+; CHECK-GI-NEXT:    mov v0.d[0], x19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.d[1], x0
 ; CHECK-GI-NEXT:    add sp, sp, #32
@@ -7496,7 +7496,7 @@ define <2 x i32> @fptos_v2f128_v2i32(<2 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __fixtfsi
-; CHECK-GI-NEXT:    fmov s0, w19
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.s[1], w0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -7539,7 +7539,7 @@ define <2 x i32> @fptou_v2f128_v2i32(<2 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __fixunstfsi
-; CHECK-GI-NEXT:    fmov s0, w19
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.s[1], w0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -7591,7 +7591,7 @@ define <3 x i32> @fptos_v3f128_v3i32(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixtfsi
-; CHECK-GI-NEXT:    fmov s0, w19
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.s[1], w20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
@@ -7644,7 +7644,7 @@ define <3 x i32> @fptou_v3f128_v3i32(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixunstfsi
-; CHECK-GI-NEXT:    fmov s0, w19
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.s[1], w20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
@@ -7689,9 +7689,8 @@ define <2 x i16> @fptos_v2f128_v2i16(<2 x fp128> %a) {
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    fmov s0, w19
-; CHECK-GI-NEXT:    fmov s1, w0
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w0
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #32
@@ -7734,9 +7733,8 @@ define <2 x i16> @fptou_v2f128_v2i16(<2 x fp128> %a) {
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    fmov s0, w19
-; CHECK-GI-NEXT:    fmov s1, w0
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w0
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #32
@@ -7791,12 +7789,10 @@ define <3 x i16> @fptos_v3f128_v3i16(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixtfsi
 ; CHECK-GI-NEXT:    fmov s0, w19
-; CHECK-GI-NEXT:    fmov s1, w20
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.h[1], w20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], w0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -7850,12 +7846,10 @@ define <3 x i16> @fptou_v3f128_v3i16(<3 x fp128> %a) {
 ; CHECK-GI-NEXT:    mov w20, w0
 ; CHECK-GI-NEXT:    bl __fixunstfsi
 ; CHECK-GI-NEXT:    fmov s0, w19
-; CHECK-GI-NEXT:    fmov s1, w20
 ; CHECK-GI-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v0.h[1], w20
 ; CHECK-GI-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], w0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -7896,7 +7890,7 @@ define <2 x i8> @fptos_v2f128_v2i8(<2 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __fixtfsi
-; CHECK-GI-NEXT:    fmov s0, w19
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.s[1], w0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -7939,7 +7933,7 @@ define <2 x i8> @fptou_v2f128_v2i8(<2 x fp128> %a) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __fixunstfsi
-; CHECK-GI-NEXT:    fmov s0, w19
+; CHECK-GI-NEXT:    mov v0.s[0], w19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.s[1], w0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0

diff  --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index aec5d7959226c3..c0d4ddef23132d 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -261,9 +261,9 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fcvt s2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -363,9 +363,9 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
 ; CHECK-GI-LABEL: fptrunc_v2f32_v2f16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0

diff  --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index 1a10fd2f1cdc3d..fe5146d79895cb 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -952,22 +952,22 @@ define <4 x half> @frem_v4f16(<4 x half> %a, <4 x half> %b) {
 ; CHECK-GI-NEXT:    fcvt s2, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h12
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s2
 ; CHECK-GI-NEXT:    bl fmodf
 ; CHECK-GI-NEXT:    fcvt s2, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
 ; CHECK-GI-NEXT:    fcvt s1, h13
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s2
 ; CHECK-GI-NEXT:    bl fmodf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]

diff  --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index 0b34f9570fa77b..557add3a4eaeb2 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -678,12 +678,12 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -701,18 +701,19 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -789,21 +790,21 @@ define <4 x half> @sin_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #56] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr x30, [sp, #72] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -919,12 +920,12 @@ define <8 x half> @sin_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -947,21 +948,21 @@ define <8 x half> @sin_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -1155,7 +1156,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h9
@@ -1180,7 +1181,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -1231,7 +1232,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl sinf
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q2, [sp, #112] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x29, x30, [sp, #304] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -1257,7 +1258,7 @@ define <16 x half> @sin_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    fcvt h2, s0
-; CHECK-GI-NEXT:    ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q0, [sp, #128] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    ldr q0, [sp, #160] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
@@ -1948,12 +1949,12 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -1971,18 +1972,19 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #112] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -2059,21 +2061,21 @@ define <4 x half> @cos_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #56] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr x30, [sp, #72] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -2189,12 +2191,12 @@ define <8 x half> @cos_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h9
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h10
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h11
@@ -2217,21 +2219,21 @@ define <8 x half> @cos_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
-; CHECK-GI-NEXT:    ldp q2, q1, [sp, #80] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    fcvt h0, s0
+; CHECK-GI-NEXT:    ldr q1, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #152] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #136] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x30, [sp, #168] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    ldr d14, [sp, #112] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #48] // 32-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp d13, d12, [sp, #120] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ldp q2, q3, [sp, #16] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
-; CHECK-GI-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    mov v1.h[4], v2.h[0]
+; CHECK-GI-NEXT:    ldp q2, q3, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mov v1.h[5], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
@@ -2425,7 +2427,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h8
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h9
@@ -2450,7 +2452,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    fcvt s1, h13
 ; CHECK-GI-NEXT:    fcvt h0, s0
-; CHECK-GI-NEXT:    str q0, [sp, #112] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str q0, [sp, #128] // 16-byte Folded Spill
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
@@ -2501,7 +2503,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    fmov s0, s1
 ; CHECK-GI-NEXT:    bl cosf
 ; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ldr q2, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q2, [sp, #112] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ldp x29, x30, [sp, #304] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    ldp q1, q2, [sp] // 32-byte Folded Reload
@@ -2527,7 +2529,7 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #96] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    fcvt h2, s0
-; CHECK-GI-NEXT:    ldr q0, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q0, [sp, #128] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v3.h[6], v0.h[0]
 ; CHECK-GI-NEXT:    ldr q0, [sp, #160] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]

diff  --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 4b48bcc5508db0..6c5fd8e52b017c 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -196,27 +196,21 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-LABEL: sqrt_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[0], v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v1.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fsqrt v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v1.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: sqrt_v7f16:

diff  --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 6baf1a84d407c4..b00e5d6c701d8b 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1228,18 +1228,18 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32>
 ; CHECK-GI-LABEL: v3i32_i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
+; CHECK-GI-NEXT:    mov w9, #-1 // =0xffffffff
 ; CHECK-GI-NEXT:    cmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NEXT:    mov v5.s[0], w9
 ; CHECK-GI-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NEXT:    mov v5.s[1], w9
 ; CHECK-GI-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    mov v5.s[2], w9
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    neg v4.4s, v4.4s
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
 ; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b

diff  --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 8b82004388b095..296e267a9c7f0b 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -250,23 +250,13 @@ entry:
 }
 
 define <3 x float> @insert_v3f32_0(<3 x float> %a, float %b, i32 %c) {
-; CHECK-SD-LABEL: insert_v3f32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-SD-NEXT:    mov v1.s[1], v0.s[1]
-; CHECK-SD-NEXT:    mov v1.s[2], v0.s[2]
-; CHECK-SD-NEXT:    mov v0.16b, v1.16b
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: insert_v3f32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    mov s0, v0.s[2]
-; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
-; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.16b, v1.16b
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: insert_v3f32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-NEXT:    mov v1.s[2], v0.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
 entry:
   %d = insertelement <3 x float> %a, float %b, i32 0
   ret <3 x float> %d
@@ -281,10 +271,11 @@ define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) {
 ;
 ; CHECK-GI-LABEL: insert_v3f32_2:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v2.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v2.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x float> %a, float %b, i32 2
@@ -983,11 +974,9 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) {
 ;
 ; CHECK-GI-LABEL: insert_v3i32_0:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, v0.s[1]
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    mov w9, v0.s[2]
-; CHECK-GI-NEXT:    mov v1.s[1], w8
-; CHECK-GI-NEXT:    mov v1.s[2], w9
+; CHECK-GI-NEXT:    mov v1.s[0], w0
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], v0.s[2]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1003,10 +992,10 @@ define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) {
 ;
 ; CHECK-GI-LABEL: insert_v3i32_2:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    mov v1.s[2], w0
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x i32> %a, i32 %b, i32 2

diff  --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 7a4c5cee27b805..4ac04798e15481 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -3309,30 +3309,28 @@ define <3 x double> @stofp_v3i8_v3f64(<3 x i8> %a) {
 ; CHECK-GI-LABEL: stofp_v3i8_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v0.h[2], w2
 ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
-; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    smov x8, v0.s[0]
 ; CHECK-GI-NEXT:    smov x9, v0.s[1]
-; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
-; CHECK-GI-NEXT:    fmov d1, x8
-; CHECK-GI-NEXT:    smov x8, v0.s[0]
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    smov x8, v1.s[0]
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    smov x9, v1.s[1]
+; CHECK-GI-NEXT:    mov v1.d[0], x8
 ; CHECK-GI-NEXT:    mov v1.d[1], x9
-; CHECK-GI-NEXT:    smov x9, v0.s[1]
-; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    scvtf v0.2d, v1.2d
-; CHECK-GI-NEXT:    mov v2.d[1], x9
+; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    scvtf v2.2d, v1.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    scvtf v2.2d, v2.2d
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    ret
@@ -3363,30 +3361,28 @@ define <3 x double> @utofp_v3i8_v3f64(<3 x i8> %a) {
 ; CHECK-GI-LABEL: utofp_v3i8_v3f64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v0.h[2], w2
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    mov h2, v1.h[1]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    mov w8, v0.s[0]
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
-; CHECK-GI-NEXT:    ushll v0.4s, v1.4h, #0
-; CHECK-GI-NEXT:    fmov d1, x8
-; CHECK-GI-NEXT:    mov w8, v0.s[0]
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov w8, v1.s[0]
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov v1.d[0], x8
 ; CHECK-GI-NEXT:    mov v1.d[1], x9
-; CHECK-GI-NEXT:    mov w9, v0.s[1]
-; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    ucvtf v0.2d, v1.2d
-; CHECK-GI-NEXT:    mov v2.d[1], x9
+; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
+; CHECK-GI-NEXT:    ucvtf v2.2d, v1.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    ucvtf v2.2d, v2.2d
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-GI-NEXT:    ret
@@ -4479,13 +4475,13 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    scvtf v1.2d, v2.2d
+; CHECK-GI-NEXT:    scvtf v2.2d, v2.2d
 ; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-GI-NEXT:    fcvtn v1.2s, v1.2d
-; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    fcvtn v2.2s, v2.2d
+; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i64> %a to <3 x float>
@@ -4511,13 +4507,13 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    ucvtf v1.2d, v2.2d
+; CHECK-GI-NEXT:    ucvtf v2.2d, v2.2d
 ; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-GI-NEXT:    fcvtn v1.2s, v1.2d
-; CHECK-GI-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    fcvtn v2.2s, v2.2d
+; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i64> %a to <3 x float>
@@ -5267,10 +5263,8 @@ define <3 x float> @stofp_v3i8_v3f32(<3 x i8> %a) {
 ; CHECK-GI-LABEL: stofp_v3i8_v3f32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v0.h[2], w2
 ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
@@ -5302,11 +5296,9 @@ define <3 x float> @utofp_v3i8_v3f32(<3 x i8> %a) {
 ; CHECK-GI-LABEL: utofp_v3i8_v3f32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v0.h[2], w2
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov h2, v0.h[1]
@@ -6227,9 +6219,9 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2d, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -6276,9 +6268,9 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2d, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -7215,9 +7207,9 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-LABEL: stofp_v2i32_v2f16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -7238,9 +7230,9 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-LABEL: utofp_v2i32_v2f16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -7448,9 +7440,9 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    shl v0.2s, v0.2s, #16
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.2s, v0.2s, #16
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -7459,8 +7451,8 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-FP16-LABEL: stofp_v2i16_v2f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
@@ -7491,9 +7483,9 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    movi d1, #0x00ffff0000ffff
 ; CHECK-GI-NOFP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -7502,8 +7494,8 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-FP16-LABEL: utofp_v2i16_v2f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
@@ -7986,9 +7978,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -7997,9 +7989,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16-LABEL: stofp_v2i8_v2f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-FP16-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-FP16-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-FP16-NEXT:    xtn v0.4h, v1.4s
 ; CHECK-GI-FP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
@@ -8048,9 +8040,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    movi d1, #0x0000ff000000ff
 ; CHECK-GI-NOFP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -8059,16 +8051,16 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16-LABEL: utofp_v2i8_v2f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-FP16-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-FP16-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-FP16-NEXT:    xtn v0.4h, v1.4s
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    movi d1, #0x0000ff000000ff
 ; CHECK-GI-FP16-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-FP16-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], w8
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
@@ -8105,10 +8097,8 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-NOFP16-LABEL: stofp_v3i8_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fmov s0, w0
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w1
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w2
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w2
 ; CHECK-GI-NOFP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
@@ -8126,10 +8116,8 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16-LABEL: stofp_v3i8_v3f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    fmov s0, w0
-; CHECK-GI-FP16-NEXT:    fmov s1, w1
-; CHECK-GI-FP16-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-FP16-NEXT:    fmov s1, w2
-; CHECK-GI-FP16-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-FP16-NEXT:    mov v0.b[1], w1
+; CHECK-GI-FP16-NEXT:    mov v0.b[2], w2
 ; CHECK-GI-FP16-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    ret
@@ -8162,11 +8150,9 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-NOFP16-LABEL: utofp_v3i8_v3f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    fmov s0, w0
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w1
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s1, w2
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    movi d1, #0xff00ff00ff00ff
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], w2
 ; CHECK-GI-NOFP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
@@ -8183,10 +8169,8 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16-LABEL: utofp_v3i8_v3f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    fmov s0, w0
-; CHECK-GI-FP16-NEXT:    fmov s1, w1
-; CHECK-GI-FP16-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-FP16-NEXT:    fmov s1, w2
-; CHECK-GI-FP16-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-FP16-NEXT:    mov v0.b[1], w1
+; CHECK-GI-FP16-NEXT:    mov v0.b[2], w2
 ; CHECK-GI-FP16-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 51d17ad0644f15..c1ea891bc86e7e 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -267,21 +267,21 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) {
 ; GISEL-NEXT:    bl exp10f
 ; GISEL-NEXT:    fcvt s1, h9
 ; GISEL-NEXT:    fcvt h0, s0
-; GISEL-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
+; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; GISEL-NEXT:    fmov s0, s1
 ; GISEL-NEXT:    bl exp10f
 ; GISEL-NEXT:    fcvt s1, h10
 ; GISEL-NEXT:    fcvt h0, s0
-; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; GISEL-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; GISEL-NEXT:    fmov s0, s1
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    ldp q2, q1, [sp, #16] // 32-byte Folded Reload
+; GISEL-NEXT:    ldp q3, q2, [sp] // 32-byte Folded Reload
 ; GISEL-NEXT:    fcvt h0, s0
+; GISEL-NEXT:    ldr q1, [sp, #32] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldp d9, d8, [sp, #56] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr x30, [sp, #72] // 8-byte Folded Reload
 ; GISEL-NEXT:    ldr d10, [sp, #48] // 8-byte Folded Reload
-; GISEL-NEXT:    mov v1.h[1], v2.h[0]
-; GISEL-NEXT:    ldr q2, [sp] // 16-byte Folded Reload
+; GISEL-NEXT:    mov v1.h[1], v3.h[0]
 ; GISEL-NEXT:    mov v1.h[2], v2.h[0]
 ; GISEL-NEXT:    mov v1.h[3], v0.h[0]
 ; GISEL-NEXT:    mov v0.16b, v1.16b

diff  --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index c3c0ec5e3d9d8d..a4d1c53c272aa1 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -118,7 +118,7 @@ define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b){
 ;
 ; CHECK-GI-LABEL: load_v2i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    ld1 { v0.b }[0], [x0]
 ; CHECK-GI-NEXT:    ldr b1, [x0, #1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -158,8 +158,8 @@ define <2 x i16> @load_v2i16(ptr %ptr){
 ; CHECK-GI-LABEL: load_v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -235,6 +235,7 @@ define <7 x i8> @load_v7i8(ptr %ptr){
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr b0, [x0]
 ; CHECK-GI-NEXT:    ldr b1, [x0, #1]
+; CHECK-GI-NEXT:    mov v0.b[0], v0.b[0]
 ; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
 ; CHECK-GI-NEXT:    ldr b1, [x0, #2]
 ; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
@@ -261,10 +262,10 @@ define <3 x i16> @load_v3i16(ptr %ptr){
 ; CHECK-GI-LABEL: load_v3i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #4]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x8]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i16>, ptr %ptr
@@ -280,18 +281,18 @@ define <7 x i16> @load_v7i16(ptr %ptr){
 ; CHECK-GI-LABEL: load_v7i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #4]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #6]
-; CHECK-GI-NEXT:    mov v0.h[3], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #8]
-; CHECK-GI-NEXT:    mov v0.h[4], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #10]
-; CHECK-GI-NEXT:    mov v0.h[5], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #12]
-; CHECK-GI-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #6
+; CHECK-GI-NEXT:    ld1 { v0.h }[3], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #8
+; CHECK-GI-NEXT:    ld1 { v0.h }[4], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #10
+; CHECK-GI-NEXT:    ld1 { v0.h }[5], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #12
+; CHECK-GI-NEXT:    ld1 { v0.h }[6], [x8]
 ; CHECK-GI-NEXT:    ret
     %a = load <7 x i16>, ptr %ptr
     ret <7 x i16> %a
@@ -305,10 +306,11 @@ define <3 x i32> @load_v3i32(ptr %ptr){
 ;
 ; CHECK-GI-LABEL: load_v3i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ldp s0, s1, [x0]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    ldr s1, [x0, #8]
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    ldr s0, [x0]
+; CHECK-GI-NEXT:    add x8, x0, #4
+; CHECK-GI-NEXT:    ld1 { v0.s }[1], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #8
+; CHECK-GI-NEXT:    ld1 { v0.s }[2], [x8]
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i32>, ptr %ptr
     ret <3 x i32> %a

diff  --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll
index 02258bc47c54d4..9e748c9641aa8c 100644
--- a/llvm/test/CodeGen/AArch64/mul.ll
+++ b/llvm/test/CodeGen/AArch64/mul.ll
@@ -83,13 +83,13 @@ define void @v2i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v2i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #1]
-; CHECK-GI-NEXT:    ldr b2, [x1]
+; CHECK-GI-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    ld1 { v1.b }[0], [x1]
+; CHECK-GI-NEXT:    ldr b2, [x0, #1]
 ; CHECK-GI-NEXT:    ldr b3, [x1, #1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v2.2s
+; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v3.s[0]
+; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str b0, [x0]
 ; CHECK-GI-NEXT:    str b1, [x0, #1]
@@ -124,22 +124,18 @@ define void @v3i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w10, [x1]
+; CHECK-GI-NEXT:    ldrb w9, [x1]
+; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    fmov s3, w11
 ; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
 ; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov h2, v0.h[2]
 ; CHECK-GI-NEXT:    str b0, [x0]
@@ -171,27 +167,27 @@ define void @v4i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v5.8b, #0
 ; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov h3, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v1.8h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x0]
 ; CHECK-GI-NEXT:    ret
@@ -259,13 +255,13 @@ define void @v2i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str h0, [x0]
@@ -293,18 +289,16 @@ define void @v3i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    add x10, x1, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #4]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    ldr h3, [x1, #4]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
+; CHECK-GI-NEXT:    mul v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]
@@ -416,14 +410,14 @@ define <2 x i64> @v2i64(<2 x i64> %d, <2 x i64> %e) {
 ;
 ; CHECK-GI-LABEL: v2i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov x10, d0
-; CHECK-GI-NEXT:    fmov x11, d1
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mov x9, v1.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x11, v1.d[1]
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov d0, x10
-; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mul x9, x10, x11
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = mul <2 x i64> %d, %e
@@ -461,16 +455,16 @@ define <3 x i64> @v3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
-; CHECK-GI-NEXT:    fmov x10, d0
-; CHECK-GI-NEXT:    fmov x11, d3
-; CHECK-GI-NEXT:    mov x8, v0.d[1]
-; CHECK-GI-NEXT:    mov x9, v3.d[1]
-; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d3
+; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    mov x11, v3.d[1]
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov x9, d5
-; CHECK-GI-NEXT:    fmov d0, x10
-; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mul x9, x10, x11
+; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov x9, d5
 ; CHECK-GI-NEXT:    mul x8, x8, x9
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -515,10 +509,10 @@ define <4 x i64> @v4i64(<4 x i64> %d, <4 x i64> %e) {
 ; CHECK-GI-NEXT:    fmov x9, d1
 ; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x9, x9, x12
-; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    mov v0.d[0], x8
 ; CHECK-GI-NEXT:    mul x11, x13, x14
+; CHECK-GI-NEXT:    mov v1.d[0], x9
 ; CHECK-GI-NEXT:    mov v0.d[1], x10
-; CHECK-GI-NEXT:    fmov d1, x9
 ; CHECK-GI-NEXT:    mov v1.d[1], x11
 ; CHECK-GI-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 50c0c8b11e7517..dbb4270fb8002e 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1120,12 +1120,10 @@ define <4 x i16> @vselect_constant_cond_zero_v4i16(<4 x i16> %a) {
 ; CHECK-GI-NEXT:    mov w8, #1 // =0x1
 ; CHECK-GI-NEXT:    mov w9, #0 // =0x0
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    mov v3.16b, v1.16b
-; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v3.b[3], v1.b[0]
-; CHECK-GI-NEXT:    ushll v1.8h, v3.8b, #0
+; CHECK-GI-NEXT:    mov v1.b[1], w9
+; CHECK-GI-NEXT:    mov v1.b[2], w9
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #15
 ; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #15
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
@@ -1148,10 +1146,9 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) {
 ; CHECK-GI-NEXT:    mov w9, #0 // =0x0
 ; CHECK-GI-NEXT:    fmov s1, w8
 ; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    mov v3.16b, v1.16b
-; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT:    ushll v1.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v2.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    mov v1.d[1], v2.d[0]
 ; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #31
@@ -1199,12 +1196,10 @@ define <4 x i16> @vselect_constant_cond_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-GI-NEXT:    mov w8, #1 // =0x1
 ; CHECK-GI-NEXT:    mov w9, #0 // =0x0
 ; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    mov v4.16b, v2.16b
-; CHECK-GI-NEXT:    mov v4.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v4.b[2], v3.b[0]
-; CHECK-GI-NEXT:    mov v4.b[3], v2.b[0]
-; CHECK-GI-NEXT:    ushll v2.8h, v4.8b, #0
+; CHECK-GI-NEXT:    mov v2.b[1], w9
+; CHECK-GI-NEXT:    mov v2.b[2], w9
+; CHECK-GI-NEXT:    mov v2.b[3], w8
+; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    shl v2.4h, v2.4h, #15
 ; CHECK-GI-NEXT:    sshr v2.4h, v2.4h, #15
 ; CHECK-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
@@ -1227,10 +1222,9 @@ define <4 x i32> @vselect_constant_cond_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-GI-NEXT:    mov w9, #0 // =0x0
 ; CHECK-GI-NEXT:    fmov s2, w8
 ; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    mov v4.16b, v2.16b
-; CHECK-GI-NEXT:    mov v4.h[1], v3.h[0]
-; CHECK-GI-NEXT:    mov v3.h[1], v2.h[0]
-; CHECK-GI-NEXT:    ushll v2.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mov v3.h[1], w8
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
 ; CHECK-GI-NEXT:    shl v2.4s, v2.4s, #31

diff  --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
index 59958afdd0d1e9..adc89f7a0d99d8 100644
--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -2674,10 +2674,10 @@ define <4 x i32> @fcmal4xfloat(<4 x float> %A, <4 x float> %B) {
 ; CHECK-GI-NEXT:    mov w8, #1 // =0x1
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    mov v1.16b, v0.16b
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v0.h[0]
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
 ; CHECK-GI-NEXT:    shl v0.4s, v1.4s, #31
 ; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #31
@@ -2725,10 +2725,10 @@ define <4 x i32> @fcmnv4xfloat(<4 x float> %A, <4 x float> %B) {
 ; CHECK-GI-NEXT:    mov w8, #0 // =0x0
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    mov v1.16b, v0.16b
-; CHECK-GI-NEXT:    mov v1.h[1], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v0.h[0]
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w8
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
 ; CHECK-GI-NEXT:    shl v0.4s, v1.4s, #31
 ; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #31

diff  --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index 6f4b090fb22bd6..f5e566f49b91e9 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -1266,133 +1266,99 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: v20:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr w9, [sp, #64]
-; CHECK-GI-NEXT:    ldr w10, [sp, #72]
-; CHECK-GI-NEXT:    and w13, w2, #0xff
-; CHECK-GI-NEXT:    ldr w11, [sp, #80]
-; CHECK-GI-NEXT:    ldr w12, [sp, #88]
-; CHECK-GI-NEXT:    fmov s19, w13
-; CHECK-GI-NEXT:    fmov s0, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #224]
-; CHECK-GI-NEXT:    fmov s16, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #232]
-; CHECK-GI-NEXT:    fmov s3, w11
-; CHECK-GI-NEXT:    ldr w11, [sp, #240]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #248]
-; CHECK-GI-NEXT:    fmov s1, w12
-; CHECK-GI-NEXT:    fmov s7, w10
-; CHECK-GI-NEXT:    and w10, w1, #0xff
-; CHECK-GI-NEXT:    fmov s5, w11
-; CHECK-GI-NEXT:    fmov s4, w9
 ; CHECK-GI-NEXT:    and w9, w0, #0xff
-; CHECK-GI-NEXT:    ldrb w11, [sp]
-; CHECK-GI-NEXT:    ldrb w12, [sp, #8]
-; CHECK-GI-NEXT:    fmov s6, w9
-; CHECK-GI-NEXT:    fmov s20, w10
-; CHECK-GI-NEXT:    ldrb w9, [sp, #96]
-; CHECK-GI-NEXT:    ldrb w10, [sp, #104]
-; CHECK-GI-NEXT:    fmov s17, w11
-; CHECK-GI-NEXT:    fmov s21, w12
-; CHECK-GI-NEXT:    ldrb w11, [sp, #160]
-; CHECK-GI-NEXT:    mov v0.b[1], v16.b[0]
-; CHECK-GI-NEXT:    fmov s18, w9
-; CHECK-GI-NEXT:    fmov s22, w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #96]
+; CHECK-GI-NEXT:    and w11, w1, #0xff
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    ldrb w9, [sp]
+; CHECK-GI-NEXT:    ldrb w12, [sp, #104]
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #160]
+; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    ldrb w9, [sp, #168]
-; CHECK-GI-NEXT:    mov v6.h[1], v20.h[0]
-; CHECK-GI-NEXT:    fmov s20, w11
-; CHECK-GI-NEXT:    ldrb w10, [sp, #16]
-; CHECK-GI-NEXT:    mov v17.h[1], v21.h[0]
-; CHECK-GI-NEXT:    fmov s21, w9
-; CHECK-GI-NEXT:    ldrb w9, [sp, #112]
-; CHECK-GI-NEXT:    mov v18.h[1], v22.h[0]
-; CHECK-GI-NEXT:    fmov s23, w10
-; CHECK-GI-NEXT:    ldrb w10, [sp, #176]
-; CHECK-GI-NEXT:    and w11, w3, #0xff
-; CHECK-GI-NEXT:    mov v2.b[1], v7.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v3.b[0]
-; CHECK-GI-NEXT:    mov v6.h[2], v19.h[0]
-; CHECK-GI-NEXT:    fmov s19, w9
-; CHECK-GI-NEXT:    mov v20.h[1], v21.h[0]
-; CHECK-GI-NEXT:    ldrb w9, [sp, #24]
-; CHECK-GI-NEXT:    fmov s22, w11
-; CHECK-GI-NEXT:    mov v17.h[2], v23.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w11
+; CHECK-GI-NEXT:    ldrb w11, [sp, #8]
+; CHECK-GI-NEXT:    fmov s3, w10
+; CHECK-GI-NEXT:    mov v2.h[1], w12
+; CHECK-GI-NEXT:    and w10, w2, #0xff
+; CHECK-GI-NEXT:    and w12, w5, #0xff
+; CHECK-GI-NEXT:    mov v1.h[1], w11
 ; CHECK-GI-NEXT:    and w11, w4, #0xff
-; CHECK-GI-NEXT:    mov v18.h[2], v19.h[0]
-; CHECK-GI-NEXT:    fmov s19, w10
-; CHECK-GI-NEXT:    ldrb w10, [sp, #120]
-; CHECK-GI-NEXT:    fmov s23, w9
+; CHECK-GI-NEXT:    mov v3.h[1], w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #112]
+; CHECK-GI-NEXT:    mov v0.h[2], w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #16]
+; CHECK-GI-NEXT:    mov v2.h[2], w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #176]
+; CHECK-GI-NEXT:    mov v1.h[2], w10
+; CHECK-GI-NEXT:    and w10, w3, #0xff
+; CHECK-GI-NEXT:    mov v3.h[2], w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #120]
+; CHECK-GI-NEXT:    mov v0.h[3], w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #24]
+; CHECK-GI-NEXT:    mov v2.h[3], w9
 ; CHECK-GI-NEXT:    ldrb w9, [sp, #184]
-; CHECK-GI-NEXT:    mov v6.h[3], v22.h[0]
-; CHECK-GI-NEXT:    fmov s21, w11
-; CHECK-GI-NEXT:    and w11, w6, #0xff
-; CHECK-GI-NEXT:    mov v2.b[2], v5.b[0]
-; CHECK-GI-NEXT:    mov v20.h[2], v19.h[0]
-; CHECK-GI-NEXT:    fmov s19, w10
-; CHECK-GI-NEXT:    fmov s16, w9
+; CHECK-GI-NEXT:    mov v1.h[3], w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #64]
+; CHECK-GI-NEXT:    mov v3.h[3], w9
 ; CHECK-GI-NEXT:    ldrb w9, [sp, #128]
-; CHECK-GI-NEXT:    and w10, w5, #0xff
-; CHECK-GI-NEXT:    mov v17.h[3], v23.h[0]
-; CHECK-GI-NEXT:    mov v6.h[4], v21.h[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v1.b[0]
-; CHECK-GI-NEXT:    mov v18.h[3], v19.h[0]
-; CHECK-GI-NEXT:    fmov s19, w9
-; CHECK-GI-NEXT:    ldrb w9, [sp, #192]
-; CHECK-GI-NEXT:    mov v20.h[3], v16.h[0]
-; CHECK-GI-NEXT:    fmov s16, w10
-; CHECK-GI-NEXT:    ldrb w10, [sp, #32]
-; CHECK-GI-NEXT:    mov v2.b[3], v4.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    mov v18.h[4], v19.h[0]
-; CHECK-GI-NEXT:    fmov s19, w10
+; CHECK-GI-NEXT:    mov v0.h[4], w11
+; CHECK-GI-NEXT:    ldrb w11, [sp, #32]
+; CHECK-GI-NEXT:    fmov s4, w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #192]
+; CHECK-GI-NEXT:    mov v2.h[4], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #72]
+; CHECK-GI-NEXT:    mov v1.h[4], w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #224]
+; CHECK-GI-NEXT:    mov v3.h[4], w10
 ; CHECK-GI-NEXT:    ldrb w10, [sp, #136]
-; CHECK-GI-NEXT:    mov v6.h[5], v16.h[0]
-; CHECK-GI-NEXT:    fmov s16, w10
-; CHECK-GI-NEXT:    ldrb w10, [sp, #48]
-; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-GI-NEXT:    mov v17.h[4], v19.h[0]
-; CHECK-GI-NEXT:    fmov s19, w9
-; CHECK-GI-NEXT:    ldrb w9, [sp, #40]
-; CHECK-GI-NEXT:    mov v18.h[5], v16.h[0]
-; CHECK-GI-NEXT:    fmov s16, w9
+; CHECK-GI-NEXT:    mov v4.b[1], w9
+; CHECK-GI-NEXT:    fmov s5, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #232]
+; CHECK-GI-NEXT:    mov v0.h[5], w12
+; CHECK-GI-NEXT:    ldrb w12, [sp, #40]
+; CHECK-GI-NEXT:    mov v2.h[5], w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #200]
 ; CHECK-GI-NEXT:    ldrb w9, [sp, #144]
-; CHECK-GI-NEXT:    mov v20.h[4], v19.h[0]
-; CHECK-GI-NEXT:    fmov s19, w11
-; CHECK-GI-NEXT:    ldrb w11, [sp, #200]
-; CHECK-GI-NEXT:    add v0.4h, v0.4h, v2.4h
-; CHECK-GI-NEXT:    fmov s7, w11
-; CHECK-GI-NEXT:    mov v17.h[5], v16.h[0]
-; CHECK-GI-NEXT:    fmov s16, w9
-; CHECK-GI-NEXT:    ldrb w11, [sp, #208]
-; CHECK-GI-NEXT:    mov v6.h[6], v19.h[0]
-; CHECK-GI-NEXT:    ldrb w9, [sp, #56]
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov v20.h[5], v7.h[0]
-; CHECK-GI-NEXT:    fmov s7, w10
-; CHECK-GI-NEXT:    mov v18.h[6], v16.h[0]
-; CHECK-GI-NEXT:    fmov s16, w11
+; CHECK-GI-NEXT:    mov v5.b[1], w11
+; CHECK-GI-NEXT:    mov v1.h[5], w12
+; CHECK-GI-NEXT:    mov v3.h[5], w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #80]
+; CHECK-GI-NEXT:    ldr w12, [sp, #240]
+; CHECK-GI-NEXT:    and w11, w6, #0xff
+; CHECK-GI-NEXT:    mov v0.h[6], w11
+; CHECK-GI-NEXT:    ldrb w11, [sp, #48]
+; CHECK-GI-NEXT:    mov v2.h[6], w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #208]
+; CHECK-GI-NEXT:    mov v4.b[2], w10
 ; CHECK-GI-NEXT:    ldrb w10, [sp, #152]
-; CHECK-GI-NEXT:    and w11, w7, #0xff
-; CHECK-GI-NEXT:    fmov s3, w11
-; CHECK-GI-NEXT:    str q0, [x8, #64]
-; CHECK-GI-NEXT:    fmov s5, w10
+; CHECK-GI-NEXT:    mov v5.b[2], w12
+; CHECK-GI-NEXT:    mov v1.h[6], w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #248]
+; CHECK-GI-NEXT:    mov v3.h[6], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-NEXT:    and w12, w7, #0xff
+; CHECK-GI-NEXT:    mov v0.h[7], w12
+; CHECK-GI-NEXT:    mov v2.h[7], w10
+; CHECK-GI-NEXT:    ldrb w12, [sp, #56]
+; CHECK-GI-NEXT:    mov v4.b[3], w9
 ; CHECK-GI-NEXT:    ldrb w10, [sp, #216]
-; CHECK-GI-NEXT:    mov v17.h[6], v7.h[0]
-; CHECK-GI-NEXT:    mov v20.h[6], v16.h[0]
-; CHECK-GI-NEXT:    fmov s7, w9
-; CHECK-GI-NEXT:    mov v6.h[7], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w10
-; CHECK-GI-NEXT:    mov v18.h[7], v5.h[0]
-; CHECK-GI-NEXT:    mov v17.h[7], v7.h[0]
-; CHECK-GI-NEXT:    mov v20.h[7], v3.h[0]
-; CHECK-GI-NEXT:    add v1.8h, v6.8h, v18.8h
-; CHECK-GI-NEXT:    add v3.8h, v17.8h, v20.8h
+; CHECK-GI-NEXT:    mov v5.b[3], w11
+; CHECK-GI-NEXT:    mov v1.h[7], w12
+; CHECK-GI-NEXT:    mov v3.h[7], w10
+; CHECK-GI-NEXT:    add v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    ushll v2.8h, v4.8b, #0
+; CHECK-GI-NEXT:    ushll v4.8h, v5.8b, #0
+; CHECK-GI-NEXT:    add v1.8h, v1.8h, v3.8h
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    add v2.4h, v2.4h, v4.4h
 ; CHECK-GI-NEXT:    ushll v4.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-GI-NEXT:    stp q4, q1, [x8]
-; CHECK-GI-NEXT:    stp q2, q3, [x8, #32]
+; CHECK-GI-NEXT:    stp q3, q0, [x8]
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    stp q4, q1, [x8, #32]
+; CHECK-GI-NEXT:    str q2, [x8, #64]
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <20 x i8> %s0 to <20 x i32>
@@ -1497,107 +1463,83 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) {
 ;
 ; CHECK-GI-LABEL: i12:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr w12, [sp]
+; CHECK-GI-NEXT:    ldr w14, [sp, #32]
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
-; CHECK-GI-NEXT:    ldr w8, [sp]
-; CHECK-GI-NEXT:    fmov s2, w5
-; CHECK-GI-NEXT:    ldr w9, [sp, #8]
-; CHECK-GI-NEXT:    ldr w11, [sp, #32]
-; CHECK-GI-NEXT:    ldr w12, [sp, #40]
-; CHECK-GI-NEXT:    fmov s5, w7
-; CHECK-GI-NEXT:    ldr w10, [sp, #16]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ldr w16, [sp, #128]
+; CHECK-GI-NEXT:    ldr w17, [sp, #160]
 ; CHECK-GI-NEXT:    fmov s1, w4
-; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    fmov s2, w12
+; CHECK-GI-NEXT:    fmov s3, w14
+; CHECK-GI-NEXT:    ldr w12, [sp, #64]
+; CHECK-GI-NEXT:    ldr w14, [sp, #96]
+; CHECK-GI-NEXT:    ldr w13, [sp, #8]
+; CHECK-GI-NEXT:    ldr w15, [sp, #40]
 ; CHECK-GI-NEXT:    fmov s4, w12
-; CHECK-GI-NEXT:    ldr w12, [sp, #96]
-; CHECK-GI-NEXT:    ldr w13, [sp, #104]
-; CHECK-GI-NEXT:    ldr w14, [sp, #128]
-; CHECK-GI-NEXT:    ldr w15, [sp, #136]
-; CHECK-GI-NEXT:    ldr w16, [sp, #160]
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w2
-; CHECK-GI-NEXT:    fmov s7, w13
-; CHECK-GI-NEXT:    fmov s16, w15
-; CHECK-GI-NEXT:    ldr w17, [sp, #168]
-; CHECK-GI-NEXT:    ldr w9, [sp, #24]
-; CHECK-GI-NEXT:    ldr w13, [sp, #176]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w6
-; CHECK-GI-NEXT:    fmov s17, w17
-; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #56]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w11
+; CHECK-GI-NEXT:    fmov s6, w16
+; CHECK-GI-NEXT:    fmov s7, w17
+; CHECK-GI-NEXT:    fmov s5, w14
+; CHECK-GI-NEXT:    mov v2.h[1], w13
+; CHECK-GI-NEXT:    mov v3.h[1], w15
+; CHECK-GI-NEXT:    ldr w13, [sp, #72]
+; CHECK-GI-NEXT:    ldr w15, [sp, #104]
+; CHECK-GI-NEXT:    ldr w12, [sp, #136]
+; CHECK-GI-NEXT:    ldr w18, [sp, #168]
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v1.h[1], w5
+; CHECK-GI-NEXT:    mov v4.h[1], w13
+; CHECK-GI-NEXT:    mov v5.h[1], w15
+; CHECK-GI-NEXT:    mov v6.h[1], w12
+; CHECK-GI-NEXT:    mov v7.h[1], w18
+; CHECK-GI-NEXT:    ldr w10, [sp, #16]
 ; CHECK-GI-NEXT:    ldr w11, [sp, #48]
-; CHECK-GI-NEXT:    mov v1.h[3], v5.h[0]
-; CHECK-GI-NEXT:    fmov s5, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #64]
-; CHECK-GI-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w3
-; CHECK-GI-NEXT:    mov v2.h[2], v5.h[0]
-; CHECK-GI-NEXT:    fmov s5, w11
-; CHECK-GI-NEXT:    ldr w11, [sp, #72]
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    fmov s6, w11
-; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    mov v3.h[2], v5.h[0]
-; CHECK-GI-NEXT:    fmov s5, w10
-; CHECK-GI-NEXT:    ldr w9, [sp, #80]
-; CHECK-GI-NEXT:    ldr w10, [sp, #112]
-; CHECK-GI-NEXT:    ldr w11, [sp, #144]
-; CHECK-GI-NEXT:    mov v2.h[3], v4.h[0]
-; CHECK-GI-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NEXT:    fmov s6, w12
-; CHECK-GI-NEXT:    fmov s18, w11
-; CHECK-GI-NEXT:    ldr w12, [sp, #88]
+; CHECK-GI-NEXT:    ldr w12, [sp, #80]
+; CHECK-GI-NEXT:    ldr w13, [sp, #112]
+; CHECK-GI-NEXT:    ldr w14, [sp, #144]
+; CHECK-GI-NEXT:    ldr w15, [sp, #176]
+; CHECK-GI-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NEXT:    mov v1.h[2], w6
+; CHECK-GI-NEXT:    mov v2.h[2], w10
+; CHECK-GI-NEXT:    mov v3.h[2], w11
+; CHECK-GI-NEXT:    mov v4.h[2], w12
+; CHECK-GI-NEXT:    mov v5.h[2], w13
+; CHECK-GI-NEXT:    mov v6.h[2], w14
+; CHECK-GI-NEXT:    mov v7.h[2], w15
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    ldr w9, [sp, #56]
+; CHECK-GI-NEXT:    ldr w10, [sp, #88]
+; CHECK-GI-NEXT:    ldr w11, [sp, #120]
+; CHECK-GI-NEXT:    ldr w12, [sp, #152]
+; CHECK-GI-NEXT:    ldr w13, [sp, #184]
+; CHECK-GI-NEXT:    mov v0.h[3], w3
+; CHECK-GI-NEXT:    mov v1.h[3], w7
+; CHECK-GI-NEXT:    mov v2.h[3], w8
+; CHECK-GI-NEXT:    mov v3.h[3], w9
+; CHECK-GI-NEXT:    mov v4.h[3], w10
+; CHECK-GI-NEXT:    mov v5.h[3], w11
+; CHECK-GI-NEXT:    mov v6.h[3], w12
+; CHECK-GI-NEXT:    mov v7.h[3], w13
+; CHECK-GI-NEXT:    movi v16.4s, #15, msl #8
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov v6.h[1], v7.h[0]
-; CHECK-GI-NEXT:    fmov s7, w14
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    mov v7.h[1], v16.h[0]
-; CHECK-GI-NEXT:    fmov s16, w16
-; CHECK-GI-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-NEXT:    fmov s17, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #152]
-; CHECK-GI-NEXT:    mov v7.h[2], v18.h[0]
-; CHECK-GI-NEXT:    fmov s18, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #120]
-; CHECK-GI-NEXT:    mov v5.h[2], v17.h[0]
-; CHECK-GI-NEXT:    fmov s17, w10
-; CHECK-GI-NEXT:    ldr w10, [sp, #184]
-; CHECK-GI-NEXT:    mov v3.h[3], v18.h[0]
-; CHECK-GI-NEXT:    fmov s4, w8
-; CHECK-GI-NEXT:    fmov s18, w10
-; CHECK-GI-NEXT:    mov v6.h[2], v17.h[0]
-; CHECK-GI-NEXT:    fmov s17, w13
 ; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    mov v16.h[2], v17.h[0]
-; CHECK-GI-NEXT:    fmov s17, w12
-; CHECK-GI-NEXT:    mov v6.h[3], v4.h[0]
-; CHECK-GI-NEXT:    movi v4.4s, #15, msl #8
-; CHECK-GI-NEXT:    mov v5.h[3], v17.h[0]
-; CHECK-GI-NEXT:    fmov s17, w9
-; CHECK-GI-NEXT:    mov v16.h[3], v18.h[0]
-; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-GI-NEXT:    mov v7.h[3], v17.h[0]
-; CHECK-GI-NEXT:    and v2.16b, v2.16b, v4.16b
-; CHECK-GI-NEXT:    and v3.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
 ; CHECK-GI-NEXT:    ushll v5.4s, v5.4h, #0
-; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
-; CHECK-GI-NEXT:    and v6.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
 ; CHECK-GI-NEXT:    ushll v7.4s, v7.4h, #0
-; CHECK-GI-NEXT:    and v5.16b, v5.16b, v4.16b
-; CHECK-GI-NEXT:    add v1.4s, v1.4s, v6.4s
-; CHECK-GI-NEXT:    and v7.16b, v7.16b, v4.16b
-; CHECK-GI-NEXT:    and v4.16b, v16.16b, v4.16b
-; CHECK-GI-NEXT:    add v0.4s, v0.4s, v5.4s
-; CHECK-GI-NEXT:    add v2.4s, v2.4s, v7.4s
-; CHECK-GI-NEXT:    add v3.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v16.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v16.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v16.16b
+; CHECK-GI-NEXT:    and v3.16b, v3.16b, v16.16b
+; CHECK-GI-NEXT:    and v4.16b, v4.16b, v16.16b
+; CHECK-GI-NEXT:    and v5.16b, v5.16b, v16.16b
+; CHECK-GI-NEXT:    and v6.16b, v6.16b, v16.16b
+; CHECK-GI-NEXT:    and v7.16b, v7.16b, v16.16b
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v4.4s
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v5.4s
+; CHECK-GI-NEXT:    add v2.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT:    add v3.4s, v3.4s, v7.4s
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i12> %s0 to <16 x i32>

diff  --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll
index 3dbc033dfab964..f83ac8ed642cc1 100644
--- a/llvm/test/CodeGen/AArch64/neon-extmul.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll
@@ -272,18 +272,18 @@ define <8 x i64> @extaddsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
 ; CHECK-GI-NEXT:    mul x15, x15, x16
 ; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    fmov x11, d0
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v1.d[0], x9
 ; CHECK-GI-NEXT:    mul x13, x13, x18
-; CHECK-GI-NEXT:    mov v0.d[1], x12
 ; CHECK-GI-NEXT:    mul x11, x11, x14
 ; CHECK-GI-NEXT:    mov x14, v6.d[1]
+; CHECK-GI-NEXT:    mov v0.d[1], x12
+; CHECK-GI-NEXT:    mov v2.d[0], x10
 ; CHECK-GI-NEXT:    mov v1.d[1], x15
-; CHECK-GI-NEXT:    fmov d2, x10
 ; CHECK-GI-NEXT:    mul x14, x14, x17
-; CHECK-GI-NEXT:    fmov d3, x11
-; CHECK-GI-NEXT:    mov v3.d[1], x13
+; CHECK-GI-NEXT:    mov v3.d[0], x11
 ; CHECK-GI-NEXT:    mov v2.d[1], x14
+; CHECK-GI-NEXT:    mov v3.d[1], x13
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i64>
@@ -423,22 +423,22 @@ define <8 x i64> @extmuladdsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b)
 ; CHECK-GI-NEXT:    mul x15, x15, x16
 ; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    fmov x11, d0
-; CHECK-GI-NEXT:    fmov d0, x8
-; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    mov v1.d[0], x9
 ; CHECK-GI-NEXT:    mul x13, x13, x18
-; CHECK-GI-NEXT:    mov v0.d[1], x12
 ; CHECK-GI-NEXT:    mul x11, x11, x14
 ; CHECK-GI-NEXT:    mov x14, v18.d[1]
+; CHECK-GI-NEXT:    mov v0.d[1], x12
+; CHECK-GI-NEXT:    mov v6.d[0], x10
 ; CHECK-GI-NEXT:    mov v1.d[1], x15
-; CHECK-GI-NEXT:    fmov d6, x10
-; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d
 ; CHECK-GI-NEXT:    mul x14, x14, x17
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    mov v7.d[0], x11
 ; CHECK-GI-NEXT:    add v1.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT:    fmov d7, x11
-; CHECK-GI-NEXT:    mov v7.d[1], x13
 ; CHECK-GI-NEXT:    mov v6.d[1], x14
-; CHECK-GI-NEXT:    add v3.2d, v7.2d, v5.2d
+; CHECK-GI-NEXT:    mov v7.d[1], x13
 ; CHECK-GI-NEXT:    add v2.2d, v6.2d, v4.2d
+; CHECK-GI-NEXT:    add v3.2d, v7.2d, v5.2d
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i64>

diff  --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll
index 15763543113eb0..2897741780f602 100644
--- a/llvm/test/CodeGen/AArch64/neon-perm.ll
+++ b/llvm/test/CodeGen/AArch64/neon-perm.ll
@@ -1741,12 +1741,13 @@ define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
  %lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

diff  --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll
index af283f6a093e97..3263a5e03c1fdc 100644
--- a/llvm/test/CodeGen/AArch64/ptradd.ll
+++ b/llvm/test/CodeGen/AArch64/ptradd.ll
@@ -77,17 +77,18 @@ define void @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off, ptr %p) {
 ;
 ; CHECK-GI-LABEL: vector_gep_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    smov x8, v3.s[0]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    smov x9, v3.s[1]
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    fmov d1, x8
-; CHECK-GI-NEXT:    mov w8, v3.s[2]
-; CHECK-GI-NEXT:    mov v1.d[1], x9
+; CHECK-GI-NEXT:    smov x9, v3.s[0]
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    smov x10, v3.s[1]
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    mov v4.d[0], x9
 ; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    mov w8, v3.s[2]
+; CHECK-GI-NEXT:    mov v4.d[1], x10
 ; CHECK-GI-NEXT:    add x8, x9, w8, sxtw
-; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v4.2d
 ; CHECK-GI-NEXT:    str x8, [x0, #16]
 ; CHECK-GI-NEXT:    str q0, [x0]
 ; CHECK-GI-NEXT:    ret
@@ -166,17 +167,18 @@ define void @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off, ptr %p) {
 ;
 ; CHECK-GI-LABEL: vector_gep_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov x8, d0
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT:    fmov x8, d2
 ; CHECK-GI-NEXT:    fmov x9, d5
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    mov v0.d[0], x8
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    fmov x8, d2
 ; CHECK-GI-NEXT:    add x8, x8, x9
-; CHECK-GI-NEXT:    str x8, [x0, #16]
 ; CHECK-GI-NEXT:    add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT:    str x8, [x0, #16]
 ; CHECK-GI-NEXT:    str q0, [x0]
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -206,13 +208,21 @@ entry:
 }
 
 define void @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off, ptr %p) {
-; CHECK-LABEL: vector_gep_v4i128:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d1, x0
-; CHECK-NEXT:    mov v1.d[1], x2
-; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    str q0, [x4]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: vector_gep_v4i128:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov d1, x0
+; CHECK-SD-NEXT:    mov v1.d[1], x2
+; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT:    str q0, [x4]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: vector_gep_v4i128:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v1.d[0], x0
+; CHECK-GI-NEXT:    mov v1.d[1], x2
+; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    str q0, [x4]
+; CHECK-GI-NEXT:    ret
 entry:
   %g = getelementptr i8, <2 x ptr> %b, <2 x i128> %off
   store <2 x ptr> %g, ptr %p

diff  --git a/llvm/test/CodeGen/AArch64/rem.ll b/llvm/test/CodeGen/AArch64/rem.ll
index 81682c5f0ce85d..d807635f5d87d1 100644
--- a/llvm/test/CodeGen/AArch64/rem.ll
+++ b/llvm/test/CodeGen/AArch64/rem.ll
@@ -190,7 +190,7 @@ define <2 x i8> @sv2i8(<2 x i8> %d, <2 x i8> %e) {
 ; CHECK-GI-NEXT:    sdiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    sdiv w9, w9, w10
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
 ; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -284,7 +284,7 @@ define <4 x i8> @sv4i8(<4 x i8> %d, <4 x i8> %e) {
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    sdiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    sdiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v0.s[3]
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
@@ -363,12 +363,12 @@ define <8 x i8> @sv8i8(<8 x i8> %d, <8 x i8> %e) {
 ; CHECK-GI-NEXT:    fmov w13, s1
 ; CHECK-GI-NEXT:    mov w14, v1.s[1]
 ; CHECK-GI-NEXT:    mov w15, v1.s[2]
-; CHECK-GI-NEXT:    mov w16, v1.s[3]
 ; CHECK-GI-NEXT:    sdiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v2.s[1]
 ; CHECK-GI-NEXT:    sdiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v2.s[2]
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[3]
 ; CHECK-GI-NEXT:    sdiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v2.s[3]
 ; CHECK-GI-NEXT:    mov v4.s[1], w9
@@ -381,11 +381,11 @@ define <8 x i8> @sv8i8(<8 x i8> %d, <8 x i8> %e) {
 ; CHECK-GI-NEXT:    mls v2.4s, v4.4s, v3.4s
 ; CHECK-GI-NEXT:    sdiv w13, w13, w14
 ; CHECK-GI-NEXT:    mov w14, v0.s[2]
-; CHECK-GI-NEXT:    fmov s5, w12
+; CHECK-GI-NEXT:    mov v5.s[0], w12
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
 ; CHECK-GI-NEXT:    sdiv w14, w14, w15
-; CHECK-GI-NEXT:    mov w15, v0.s[3]
 ; CHECK-GI-NEXT:    mov v5.s[1], w13
-; CHECK-GI-NEXT:    sdiv w8, w15, w16
+; CHECK-GI-NEXT:    sdiv w8, w8, w12
 ; CHECK-GI-NEXT:    mov v5.s[2], w14
 ; CHECK-GI-NEXT:    mov v5.s[3], w8
 ; CHECK-GI-NEXT:    mls v0.4s, v5.4s, v1.4s
@@ -527,20 +527,20 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) {
 ; CHECK-GI-NEXT:    mov w18, v1.s[1]
 ; CHECK-GI-NEXT:    mov w0, v1.s[2]
 ; CHECK-GI-NEXT:    mov w1, v1.s[3]
-; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w11, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v2.s[1]
 ; CHECK-GI-NEXT:    mov w9, v3.s[1]
 ; CHECK-GI-NEXT:    fmov w2, s7
 ; CHECK-GI-NEXT:    mov w3, v7.s[1]
 ; CHECK-GI-NEXT:    mov w4, v7.s[2]
-; CHECK-GI-NEXT:    mov w5, v7.s[3]
-; CHECK-GI-NEXT:    sdiv w11, w8, w9
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v2.s[2]
 ; CHECK-GI-NEXT:    mov w9, v3.s[2]
-; CHECK-GI-NEXT:    fmov s16, w10
+; CHECK-GI-NEXT:    mov v16.s[0], w11
+; CHECK-GI-NEXT:    mov w11, v6.s[3]
 ; CHECK-GI-NEXT:    sdiv w9, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v2.s[3]
-; CHECK-GI-NEXT:    mov v16.s[1], w11
+; CHECK-GI-NEXT:    mov v16.s[1], w10
 ; CHECK-GI-NEXT:    sdiv w8, w8, w12
 ; CHECK-GI-NEXT:    fmov w12, s4
 ; CHECK-GI-NEXT:    mov v16.s[2], w9
@@ -552,7 +552,8 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) {
 ; CHECK-GI-NEXT:    sdiv w15, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[2]
 ; CHECK-GI-NEXT:    mov w13, v5.s[2]
-; CHECK-GI-NEXT:    fmov s17, w14
+; CHECK-GI-NEXT:    mov v17.s[0], w14
+; CHECK-GI-NEXT:    mov w14, v7.s[3]
 ; CHECK-GI-NEXT:    sdiv w13, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[3]
 ; CHECK-GI-NEXT:    mov v17.s[1], w15
@@ -565,7 +566,7 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) {
 ; CHECK-GI-NEXT:    mls v4.4s, v17.4s, v5.4s
 ; CHECK-GI-NEXT:    sdiv w17, w17, w18
 ; CHECK-GI-NEXT:    mov w18, v0.s[2]
-; CHECK-GI-NEXT:    fmov s18, w16
+; CHECK-GI-NEXT:    mov v18.s[0], w16
 ; CHECK-GI-NEXT:    sdiv w18, w18, w0
 ; CHECK-GI-NEXT:    mov w0, v0.s[3]
 ; CHECK-GI-NEXT:    mov v18.s[1], w17
@@ -579,11 +580,10 @@ define <16 x i8> @sv16i8(<16 x i8> %d, <16 x i8> %e) {
 ; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
 ; CHECK-GI-NEXT:    sdiv w2, w2, w3
 ; CHECK-GI-NEXT:    mov w3, v6.s[2]
-; CHECK-GI-NEXT:    fmov s19, w1
+; CHECK-GI-NEXT:    mov v19.s[0], w1
 ; CHECK-GI-NEXT:    sdiv w3, w3, w4
-; CHECK-GI-NEXT:    mov w4, v6.s[3]
 ; CHECK-GI-NEXT:    mov v19.s[1], w2
-; CHECK-GI-NEXT:    sdiv w10, w4, w5
+; CHECK-GI-NEXT:    sdiv w10, w11, w14
 ; CHECK-GI-NEXT:    mov v19.s[2], w3
 ; CHECK-GI-NEXT:    mov v19.s[3], w10
 ; CHECK-GI-NEXT:    mls v6.4s, v19.4s, v7.4s
@@ -866,14 +866,13 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) {
 ;
 ; CHECK-GI-LABEL: sv32i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sub sp, sp, #112
-; CHECK-GI-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x26, x25, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-GI-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
 ; CHECK-GI-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NEXT:    .cfi_offset w21, -24
@@ -902,43 +901,41 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) {
 ; CHECK-GI-NEXT:    fmov w9, s7
 ; CHECK-GI-NEXT:    mov w12, v7.s[3]
 ; CHECK-GI-NEXT:    fmov w13, s5
-; CHECK-GI-NEXT:    mov w14, v5.s[1]
 ; CHECK-GI-NEXT:    mov w16, v5.s[3]
 ; CHECK-GI-NEXT:    fmov w6, s19
 ; CHECK-GI-NEXT:    mov w7, v19.s[3]
 ; CHECK-GI-NEXT:    fmov w21, s17
-; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    mov w23, v17.s[3]
+; CHECK-GI-NEXT:    sdiv w11, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v6.s[1]
 ; CHECK-GI-NEXT:    mov w9, v7.s[1]
-; CHECK-GI-NEXT:    mov w22, v17.s[3]
-; CHECK-GI-NEXT:    sdiv w11, w8, w9
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v6.s[2]
 ; CHECK-GI-NEXT:    mov w9, v7.s[2]
-; CHECK-GI-NEXT:    fmov s20, w10
+; CHECK-GI-NEXT:    mov v20.s[0], w11
 ; CHECK-GI-NEXT:    sdiv w9, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v6.s[3]
 ; CHECK-GI-NEXT:    sshll2 v6.8h, v0.16b, #0
-; CHECK-GI-NEXT:    mov v20.s[1], w11
+; CHECK-GI-NEXT:    mov v20.s[1], w10
 ; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    sshll v28.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
 ; CHECK-GI-NEXT:    sdiv w8, w8, w12
 ; CHECK-GI-NEXT:    fmov w12, s4
 ; CHECK-GI-NEXT:    mov v20.s[2], w9
-; CHECK-GI-NEXT:    sdiv w13, w12, w13
+; CHECK-GI-NEXT:    sdiv w15, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[1]
-; CHECK-GI-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
-; CHECK-GI-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
-; CHECK-GI-NEXT:    mov v20.s[3], w11
-; CHECK-GI-NEXT:    sdiv w15, w12, w14
+; CHECK-GI-NEXT:    mov w13, v5.s[1]
+; CHECK-GI-NEXT:    mov v20.s[3], w8
+; CHECK-GI-NEXT:    sdiv w14, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[2]
-; CHECK-GI-NEXT:    mov w14, v5.s[2]
+; CHECK-GI-NEXT:    mov w13, v5.s[2]
 ; CHECK-GI-NEXT:    sshll v5.4s, v6.4h, #0
-; CHECK-GI-NEXT:    fmov s21, w13
-; CHECK-GI-NEXT:    sdiv w14, w12, w14
+; CHECK-GI-NEXT:    mov v21.s[0], w15
+; CHECK-GI-NEXT:    sdiv w13, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[3]
 ; CHECK-GI-NEXT:    sshll2 v4.8h, v2.16b, #0
-; CHECK-GI-NEXT:    mov v21.s[1], w15
+; CHECK-GI-NEXT:    mov v21.s[1], w14
 ; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    sshll v7.4s, v4.4h, #0
 ; CHECK-GI-NEXT:    sshll v30.4s, v2.4h, #0
@@ -947,72 +944,72 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) {
 ; CHECK-GI-NEXT:    mls v28.4s, v20.4s, v30.4s
 ; CHECK-GI-NEXT:    sdiv w12, w12, w16
 ; CHECK-GI-NEXT:    fmov w16, s5
-; CHECK-GI-NEXT:    mov v21.s[2], w14
-; CHECK-GI-NEXT:    sdiv w18, w16, w17
+; CHECK-GI-NEXT:    mov v21.s[2], w13
+; CHECK-GI-NEXT:    sdiv w1, w16, w17
 ; CHECK-GI-NEXT:    mov w16, v5.s[1]
 ; CHECK-GI-NEXT:    mov w17, v7.s[1]
 ; CHECK-GI-NEXT:    mov v21.s[3], w12
 ; CHECK-GI-NEXT:    mls v0.4s, v21.4s, v2.4s
-; CHECK-GI-NEXT:    sdiv w1, w16, w17
+; CHECK-GI-NEXT:    sdiv w0, w16, w17
 ; CHECK-GI-NEXT:    mov w16, v5.s[2]
 ; CHECK-GI-NEXT:    mov w17, v7.s[2]
-; CHECK-GI-NEXT:    fmov s22, w18
+; CHECK-GI-NEXT:    mov v22.s[0], w1
 ; CHECK-GI-NEXT:    uzp1 v0.8h, v28.8h, v0.8h
-; CHECK-GI-NEXT:    sdiv w0, w16, w17
+; CHECK-GI-NEXT:    sdiv w18, w16, w17
 ; CHECK-GI-NEXT:    mov w16, v5.s[3]
 ; CHECK-GI-NEXT:    mov w17, v7.s[3]
 ; CHECK-GI-NEXT:    sshll2 v5.4s, v6.8h, #0
 ; CHECK-GI-NEXT:    sshll2 v7.4s, v4.8h, #0
-; CHECK-GI-NEXT:    mov v22.s[1], w1
+; CHECK-GI-NEXT:    mov v22.s[1], w0
 ; CHECK-GI-NEXT:    sshll v6.4s, v6.4h, #0
 ; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
 ; CHECK-GI-NEXT:    fmov w2, s7
-; CHECK-GI-NEXT:    mov w3, v7.s[3]
+; CHECK-GI-NEXT:    mov w4, v7.s[3]
 ; CHECK-GI-NEXT:    sdiv w16, w16, w17
 ; CHECK-GI-NEXT:    fmov w17, s5
-; CHECK-GI-NEXT:    mov v22.s[2], w0
+; CHECK-GI-NEXT:    mov v22.s[2], w18
 ; CHECK-GI-NEXT:    sdiv w5, w17, w2
 ; CHECK-GI-NEXT:    mov w17, v5.s[1]
 ; CHECK-GI-NEXT:    mov w2, v7.s[1]
 ; CHECK-GI-NEXT:    mov v22.s[3], w16
 ; CHECK-GI-NEXT:    mls v6.4s, v22.4s, v4.4s
-; CHECK-GI-NEXT:    sdiv w4, w17, w2
+; CHECK-GI-NEXT:    sdiv w3, w17, w2
 ; CHECK-GI-NEXT:    mov w17, v5.s[2]
 ; CHECK-GI-NEXT:    mov w2, v7.s[2]
-; CHECK-GI-NEXT:    fmov s23, w5
+; CHECK-GI-NEXT:    mov v23.s[0], w5
 ; CHECK-GI-NEXT:    sdiv w2, w17, w2
 ; CHECK-GI-NEXT:    mov w17, v5.s[3]
-; CHECK-GI-NEXT:    mov v23.s[1], w4
-; CHECK-GI-NEXT:    sdiv w17, w17, w3
-; CHECK-GI-NEXT:    fmov w3, s18
+; CHECK-GI-NEXT:    mov v23.s[1], w3
+; CHECK-GI-NEXT:    sdiv w17, w17, w4
+; CHECK-GI-NEXT:    fmov w4, s18
 ; CHECK-GI-NEXT:    mov v23.s[2], w2
-; CHECK-GI-NEXT:    sdiv w20, w3, w6
-; CHECK-GI-NEXT:    mov w3, v18.s[1]
+; CHECK-GI-NEXT:    sdiv w20, w4, w6
+; CHECK-GI-NEXT:    mov w4, v18.s[1]
 ; CHECK-GI-NEXT:    mov w6, v19.s[1]
 ; CHECK-GI-NEXT:    mov v23.s[3], w17
 ; CHECK-GI-NEXT:    mls v5.4s, v23.4s, v7.4s
-; CHECK-GI-NEXT:    sdiv w19, w3, w6
-; CHECK-GI-NEXT:    mov w3, v18.s[2]
+; CHECK-GI-NEXT:    sdiv w19, w4, w6
+; CHECK-GI-NEXT:    mov w4, v18.s[2]
 ; CHECK-GI-NEXT:    mov w6, v19.s[2]
-; CHECK-GI-NEXT:    fmov s24, w20
+; CHECK-GI-NEXT:    mov v24.s[0], w20
 ; CHECK-GI-NEXT:    uzp1 v2.8h, v6.8h, v5.8h
 ; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    sdiv w6, w3, w6
-; CHECK-GI-NEXT:    mov w3, v18.s[3]
+; CHECK-GI-NEXT:    sdiv w6, w4, w6
+; CHECK-GI-NEXT:    mov w4, v18.s[3]
 ; CHECK-GI-NEXT:    mov v24.s[1], w19
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    sdiv w3, w3, w7
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w4, w4, w7
 ; CHECK-GI-NEXT:    fmov w7, s16
 ; CHECK-GI-NEXT:    mov v24.s[2], w6
-; CHECK-GI-NEXT:    sdiv w23, w7, w21
+; CHECK-GI-NEXT:    sdiv w24, w7, w21
 ; CHECK-GI-NEXT:    mov w7, v16.s[1]
 ; CHECK-GI-NEXT:    mov w21, v17.s[1]
-; CHECK-GI-NEXT:    mov v24.s[3], w3
-; CHECK-GI-NEXT:    sdiv w24, w7, w21
+; CHECK-GI-NEXT:    mov v24.s[3], w4
+; CHECK-GI-NEXT:    sdiv w22, w7, w21
 ; CHECK-GI-NEXT:    mov w7, v16.s[2]
 ; CHECK-GI-NEXT:    mov w21, v17.s[2]
 ; CHECK-GI-NEXT:    sshll2 v17.8h, v1.16b, #0
-; CHECK-GI-NEXT:    fmov s25, w23
+; CHECK-GI-NEXT:    mov v25.s[0], w24
 ; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    sshll v18.4s, v17.4h, #0
 ; CHECK-GI-NEXT:    sshll v29.4s, v1.4h, #0
@@ -1020,9 +1017,8 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) {
 ; CHECK-GI-NEXT:    sdiv w21, w7, w21
 ; CHECK-GI-NEXT:    mov w7, v16.s[3]
 ; CHECK-GI-NEXT:    sshll2 v16.8h, v3.16b, #0
-; CHECK-GI-NEXT:    mov v25.s[1], w24
+; CHECK-GI-NEXT:    mov v25.s[1], w22
 ; CHECK-GI-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-GI-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    sshll v19.4s, v16.4h, #0
 ; CHECK-GI-NEXT:    sshll v31.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
@@ -1032,51 +1028,51 @@ define <32 x i8> @sv32i8(<32 x i8> %d, <32 x i8> %e) {
 ; CHECK-GI-NEXT:    mov w28, v19.s[3]
 ; CHECK-GI-NEXT:    sshll2 v19.4s, v16.8h, #0
 ; CHECK-GI-NEXT:    sshll v16.4s, v16.4h, #0
-; CHECK-GI-NEXT:    sdiv w7, w7, w22
-; CHECK-GI-NEXT:    fmov w22, s18
+; CHECK-GI-NEXT:    sdiv w7, w7, w23
+; CHECK-GI-NEXT:    fmov w23, s18
 ; CHECK-GI-NEXT:    mov v25.s[2], w21
 ; CHECK-GI-NEXT:    mls v29.4s, v24.4s, v31.4s
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov w29, s19
 ; CHECK-GI-NEXT:    mov w30, v19.s[1]
-; CHECK-GI-NEXT:    mov w8, v19.s[2]
-; CHECK-GI-NEXT:    mov w10, v19.s[3]
-; CHECK-GI-NEXT:    sdiv w25, w22, w25
-; CHECK-GI-NEXT:    mov w22, v18.s[1]
+; CHECK-GI-NEXT:    mov w15, v19.s[2]
+; CHECK-GI-NEXT:    sdiv w25, w23, w25
+; CHECK-GI-NEXT:    mov w23, v18.s[1]
 ; CHECK-GI-NEXT:    mov v25.s[3], w7
 ; CHECK-GI-NEXT:    mls v1.4s, v25.4s, v3.4s
-; CHECK-GI-NEXT:    sdiv w26, w22, w26
-; CHECK-GI-NEXT:    mov w22, v18.s[2]
-; CHECK-GI-NEXT:    fmov s26, w25
+; CHECK-GI-NEXT:    sdiv w26, w23, w26
+; CHECK-GI-NEXT:    mov w23, v18.s[2]
+; CHECK-GI-NEXT:    mov v26.s[0], w25
 ; CHECK-GI-NEXT:    uzp1 v1.8h, v29.8h, v1.8h
-; CHECK-GI-NEXT:    sdiv w27, w22, w27
-; CHECK-GI-NEXT:    mov w22, v18.s[3]
+; CHECK-GI-NEXT:    sdiv w27, w23, w27
+; CHECK-GI-NEXT:    mov w23, v18.s[3]
 ; CHECK-GI-NEXT:    sshll2 v18.4s, v17.8h, #0
 ; CHECK-GI-NEXT:    mov v26.s[1], w26
 ; CHECK-GI-NEXT:    sshll v17.4s, v17.4h, #0
-; CHECK-GI-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov w11, v18.s[2]
 ; CHECK-GI-NEXT:    mov w9, v18.s[3]
-; CHECK-GI-NEXT:    sdiv w22, w22, w28
+; CHECK-GI-NEXT:    sdiv w23, w23, w28
 ; CHECK-GI-NEXT:    fmov w28, s18
 ; CHECK-GI-NEXT:    mov v26.s[2], w27
 ; CHECK-GI-NEXT:    sdiv w28, w28, w29
 ; CHECK-GI-NEXT:    mov w29, v18.s[1]
-; CHECK-GI-NEXT:    mov v26.s[3], w22
-; CHECK-GI-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v26.s[3], w23
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mls v17.4s, v26.4s, v16.4s
 ; CHECK-GI-NEXT:    sdiv w29, w29, w30
-; CHECK-GI-NEXT:    mov w30, v18.s[2]
-; CHECK-GI-NEXT:    fmov s27, w28
-; CHECK-GI-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    sdiv w8, w30, w8
+; CHECK-GI-NEXT:    mov v27.s[0], w28
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sdiv w10, w11, w15
+; CHECK-GI-NEXT:    mov w11, v19.s[3]
 ; CHECK-GI-NEXT:    mov v27.s[1], w29
-; CHECK-GI-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    sdiv w9, w9, w10
-; CHECK-GI-NEXT:    mov v27.s[2], w8
-; CHECK-GI-NEXT:    mov v27.s[3], w9
+; CHECK-GI-NEXT:    sdiv w8, w9, w11
+; CHECK-GI-NEXT:    mov v27.s[2], w10
+; CHECK-GI-NEXT:    mov v27.s[3], w8
 ; CHECK-GI-NEXT:    mls v18.4s, v27.4s, v19.4s
 ; CHECK-GI-NEXT:    uzp1 v3.8h, v17.8h, v18.8h
 ; CHECK-GI-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT:    add sp, sp, #112
+; CHECK-GI-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = srem <32 x i8> %d, %e
@@ -1113,7 +1109,7 @@ define <2 x i8> @uv2i8(<2 x i8> %d, <2 x i8> %e) {
 ; CHECK-GI-NEXT:    udiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    udiv w9, w9, w10
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
 ; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -1206,7 +1202,7 @@ define <4 x i8> @uv4i8(<4 x i8> %d, <4 x i8> %e) {
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    udiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    udiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v0.s[3]
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
@@ -1285,12 +1281,12 @@ define <8 x i8> @uv8i8(<8 x i8> %d, <8 x i8> %e) {
 ; CHECK-GI-NEXT:    fmov w13, s1
 ; CHECK-GI-NEXT:    mov w14, v1.s[1]
 ; CHECK-GI-NEXT:    mov w15, v1.s[2]
-; CHECK-GI-NEXT:    mov w16, v1.s[3]
 ; CHECK-GI-NEXT:    udiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v2.s[1]
 ; CHECK-GI-NEXT:    udiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v2.s[2]
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[3]
 ; CHECK-GI-NEXT:    udiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v2.s[3]
 ; CHECK-GI-NEXT:    mov v4.s[1], w9
@@ -1303,11 +1299,11 @@ define <8 x i8> @uv8i8(<8 x i8> %d, <8 x i8> %e) {
 ; CHECK-GI-NEXT:    mls v2.4s, v4.4s, v3.4s
 ; CHECK-GI-NEXT:    udiv w13, w13, w14
 ; CHECK-GI-NEXT:    mov w14, v0.s[2]
-; CHECK-GI-NEXT:    fmov s5, w12
+; CHECK-GI-NEXT:    mov v5.s[0], w12
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
 ; CHECK-GI-NEXT:    udiv w14, w14, w15
-; CHECK-GI-NEXT:    mov w15, v0.s[3]
 ; CHECK-GI-NEXT:    mov v5.s[1], w13
-; CHECK-GI-NEXT:    udiv w8, w15, w16
+; CHECK-GI-NEXT:    udiv w8, w8, w12
 ; CHECK-GI-NEXT:    mov v5.s[2], w14
 ; CHECK-GI-NEXT:    mov v5.s[3], w8
 ; CHECK-GI-NEXT:    mls v0.4s, v5.4s, v1.4s
@@ -1449,20 +1445,20 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) {
 ; CHECK-GI-NEXT:    mov w18, v1.s[1]
 ; CHECK-GI-NEXT:    mov w0, v1.s[2]
 ; CHECK-GI-NEXT:    mov w1, v1.s[3]
-; CHECK-GI-NEXT:    udiv w10, w8, w9
+; CHECK-GI-NEXT:    udiv w11, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v2.s[1]
 ; CHECK-GI-NEXT:    mov w9, v3.s[1]
 ; CHECK-GI-NEXT:    fmov w2, s7
 ; CHECK-GI-NEXT:    mov w3, v7.s[1]
 ; CHECK-GI-NEXT:    mov w4, v7.s[2]
-; CHECK-GI-NEXT:    mov w5, v7.s[3]
-; CHECK-GI-NEXT:    udiv w11, w8, w9
+; CHECK-GI-NEXT:    udiv w10, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v2.s[2]
 ; CHECK-GI-NEXT:    mov w9, v3.s[2]
-; CHECK-GI-NEXT:    fmov s16, w10
+; CHECK-GI-NEXT:    mov v16.s[0], w11
+; CHECK-GI-NEXT:    mov w11, v6.s[3]
 ; CHECK-GI-NEXT:    udiv w9, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v2.s[3]
-; CHECK-GI-NEXT:    mov v16.s[1], w11
+; CHECK-GI-NEXT:    mov v16.s[1], w10
 ; CHECK-GI-NEXT:    udiv w8, w8, w12
 ; CHECK-GI-NEXT:    fmov w12, s4
 ; CHECK-GI-NEXT:    mov v16.s[2], w9
@@ -1474,7 +1470,8 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) {
 ; CHECK-GI-NEXT:    udiv w15, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[2]
 ; CHECK-GI-NEXT:    mov w13, v5.s[2]
-; CHECK-GI-NEXT:    fmov s17, w14
+; CHECK-GI-NEXT:    mov v17.s[0], w14
+; CHECK-GI-NEXT:    mov w14, v7.s[3]
 ; CHECK-GI-NEXT:    udiv w13, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[3]
 ; CHECK-GI-NEXT:    mov v17.s[1], w15
@@ -1487,7 +1484,7 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) {
 ; CHECK-GI-NEXT:    mls v4.4s, v17.4s, v5.4s
 ; CHECK-GI-NEXT:    udiv w17, w17, w18
 ; CHECK-GI-NEXT:    mov w18, v0.s[2]
-; CHECK-GI-NEXT:    fmov s18, w16
+; CHECK-GI-NEXT:    mov v18.s[0], w16
 ; CHECK-GI-NEXT:    udiv w18, w18, w0
 ; CHECK-GI-NEXT:    mov w0, v0.s[3]
 ; CHECK-GI-NEXT:    mov v18.s[1], w17
@@ -1501,11 +1498,10 @@ define <16 x i8> @uv16i8(<16 x i8> %d, <16 x i8> %e) {
 ; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v4.8h
 ; CHECK-GI-NEXT:    udiv w2, w2, w3
 ; CHECK-GI-NEXT:    mov w3, v6.s[2]
-; CHECK-GI-NEXT:    fmov s19, w1
+; CHECK-GI-NEXT:    mov v19.s[0], w1
 ; CHECK-GI-NEXT:    udiv w3, w3, w4
-; CHECK-GI-NEXT:    mov w4, v6.s[3]
 ; CHECK-GI-NEXT:    mov v19.s[1], w2
-; CHECK-GI-NEXT:    udiv w10, w4, w5
+; CHECK-GI-NEXT:    udiv w10, w11, w14
 ; CHECK-GI-NEXT:    mov v19.s[2], w3
 ; CHECK-GI-NEXT:    mov v19.s[3], w10
 ; CHECK-GI-NEXT:    mls v6.4s, v19.4s, v7.4s
@@ -1788,14 +1784,13 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) {
 ;
 ; CHECK-GI-LABEL: uv32i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sub sp, sp, #112
-; CHECK-GI-NEXT:    stp x29, x30, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x26, x25, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x24, x23, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x22, x21, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 112
+; CHECK-GI-NEXT:    stp x29, x30, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x26, x25, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x24, x23, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 96
 ; CHECK-GI-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NEXT:    .cfi_offset w21, -24
@@ -1824,43 +1819,41 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) {
 ; CHECK-GI-NEXT:    fmov w9, s7
 ; CHECK-GI-NEXT:    mov w12, v7.s[3]
 ; CHECK-GI-NEXT:    fmov w13, s5
-; CHECK-GI-NEXT:    mov w14, v5.s[1]
 ; CHECK-GI-NEXT:    mov w16, v5.s[3]
 ; CHECK-GI-NEXT:    fmov w6, s19
 ; CHECK-GI-NEXT:    mov w7, v19.s[3]
 ; CHECK-GI-NEXT:    fmov w21, s17
-; CHECK-GI-NEXT:    udiv w10, w8, w9
+; CHECK-GI-NEXT:    mov w23, v17.s[3]
+; CHECK-GI-NEXT:    udiv w11, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v6.s[1]
 ; CHECK-GI-NEXT:    mov w9, v7.s[1]
-; CHECK-GI-NEXT:    mov w22, v17.s[3]
-; CHECK-GI-NEXT:    udiv w11, w8, w9
+; CHECK-GI-NEXT:    udiv w10, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v6.s[2]
 ; CHECK-GI-NEXT:    mov w9, v7.s[2]
-; CHECK-GI-NEXT:    fmov s20, w10
+; CHECK-GI-NEXT:    mov v20.s[0], w11
 ; CHECK-GI-NEXT:    udiv w9, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v6.s[3]
 ; CHECK-GI-NEXT:    ushll2 v6.8h, v0.16b, #0
-; CHECK-GI-NEXT:    mov v20.s[1], w11
+; CHECK-GI-NEXT:    mov v20.s[1], w10
 ; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    ushll v28.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
 ; CHECK-GI-NEXT:    udiv w8, w8, w12
 ; CHECK-GI-NEXT:    fmov w12, s4
 ; CHECK-GI-NEXT:    mov v20.s[2], w9
-; CHECK-GI-NEXT:    udiv w13, w12, w13
+; CHECK-GI-NEXT:    udiv w15, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[1]
-; CHECK-GI-NEXT:    str w8, [sp, #12] // 4-byte Folded Spill
-; CHECK-GI-NEXT:    ldr w11, [sp, #12] // 4-byte Folded Reload
-; CHECK-GI-NEXT:    mov v20.s[3], w11
-; CHECK-GI-NEXT:    udiv w15, w12, w14
+; CHECK-GI-NEXT:    mov w13, v5.s[1]
+; CHECK-GI-NEXT:    mov v20.s[3], w8
+; CHECK-GI-NEXT:    udiv w14, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[2]
-; CHECK-GI-NEXT:    mov w14, v5.s[2]
+; CHECK-GI-NEXT:    mov w13, v5.s[2]
 ; CHECK-GI-NEXT:    ushll v5.4s, v6.4h, #0
-; CHECK-GI-NEXT:    fmov s21, w13
-; CHECK-GI-NEXT:    udiv w14, w12, w14
+; CHECK-GI-NEXT:    mov v21.s[0], w15
+; CHECK-GI-NEXT:    udiv w13, w12, w13
 ; CHECK-GI-NEXT:    mov w12, v4.s[3]
 ; CHECK-GI-NEXT:    ushll2 v4.8h, v2.16b, #0
-; CHECK-GI-NEXT:    mov v21.s[1], w15
+; CHECK-GI-NEXT:    mov v21.s[1], w14
 ; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    ushll v7.4s, v4.4h, #0
 ; CHECK-GI-NEXT:    ushll v30.4s, v2.4h, #0
@@ -1869,72 +1862,72 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) {
 ; CHECK-GI-NEXT:    mls v28.4s, v20.4s, v30.4s
 ; CHECK-GI-NEXT:    udiv w12, w12, w16
 ; CHECK-GI-NEXT:    fmov w16, s5
-; CHECK-GI-NEXT:    mov v21.s[2], w14
-; CHECK-GI-NEXT:    udiv w18, w16, w17
+; CHECK-GI-NEXT:    mov v21.s[2], w13
+; CHECK-GI-NEXT:    udiv w1, w16, w17
 ; CHECK-GI-NEXT:    mov w16, v5.s[1]
 ; CHECK-GI-NEXT:    mov w17, v7.s[1]
 ; CHECK-GI-NEXT:    mov v21.s[3], w12
 ; CHECK-GI-NEXT:    mls v0.4s, v21.4s, v2.4s
-; CHECK-GI-NEXT:    udiv w1, w16, w17
+; CHECK-GI-NEXT:    udiv w0, w16, w17
 ; CHECK-GI-NEXT:    mov w16, v5.s[2]
 ; CHECK-GI-NEXT:    mov w17, v7.s[2]
-; CHECK-GI-NEXT:    fmov s22, w18
+; CHECK-GI-NEXT:    mov v22.s[0], w1
 ; CHECK-GI-NEXT:    uzp1 v0.8h, v28.8h, v0.8h
-; CHECK-GI-NEXT:    udiv w0, w16, w17
+; CHECK-GI-NEXT:    udiv w18, w16, w17
 ; CHECK-GI-NEXT:    mov w16, v5.s[3]
 ; CHECK-GI-NEXT:    mov w17, v7.s[3]
 ; CHECK-GI-NEXT:    ushll2 v5.4s, v6.8h, #0
 ; CHECK-GI-NEXT:    ushll2 v7.4s, v4.8h, #0
-; CHECK-GI-NEXT:    mov v22.s[1], w1
+; CHECK-GI-NEXT:    mov v22.s[1], w0
 ; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
 ; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
 ; CHECK-GI-NEXT:    fmov w2, s7
-; CHECK-GI-NEXT:    mov w3, v7.s[3]
+; CHECK-GI-NEXT:    mov w4, v7.s[3]
 ; CHECK-GI-NEXT:    udiv w16, w16, w17
 ; CHECK-GI-NEXT:    fmov w17, s5
-; CHECK-GI-NEXT:    mov v22.s[2], w0
+; CHECK-GI-NEXT:    mov v22.s[2], w18
 ; CHECK-GI-NEXT:    udiv w5, w17, w2
 ; CHECK-GI-NEXT:    mov w17, v5.s[1]
 ; CHECK-GI-NEXT:    mov w2, v7.s[1]
 ; CHECK-GI-NEXT:    mov v22.s[3], w16
 ; CHECK-GI-NEXT:    mls v6.4s, v22.4s, v4.4s
-; CHECK-GI-NEXT:    udiv w4, w17, w2
+; CHECK-GI-NEXT:    udiv w3, w17, w2
 ; CHECK-GI-NEXT:    mov w17, v5.s[2]
 ; CHECK-GI-NEXT:    mov w2, v7.s[2]
-; CHECK-GI-NEXT:    fmov s23, w5
+; CHECK-GI-NEXT:    mov v23.s[0], w5
 ; CHECK-GI-NEXT:    udiv w2, w17, w2
 ; CHECK-GI-NEXT:    mov w17, v5.s[3]
-; CHECK-GI-NEXT:    mov v23.s[1], w4
-; CHECK-GI-NEXT:    udiv w17, w17, w3
-; CHECK-GI-NEXT:    fmov w3, s18
+; CHECK-GI-NEXT:    mov v23.s[1], w3
+; CHECK-GI-NEXT:    udiv w17, w17, w4
+; CHECK-GI-NEXT:    fmov w4, s18
 ; CHECK-GI-NEXT:    mov v23.s[2], w2
-; CHECK-GI-NEXT:    udiv w20, w3, w6
-; CHECK-GI-NEXT:    mov w3, v18.s[1]
+; CHECK-GI-NEXT:    udiv w20, w4, w6
+; CHECK-GI-NEXT:    mov w4, v18.s[1]
 ; CHECK-GI-NEXT:    mov w6, v19.s[1]
 ; CHECK-GI-NEXT:    mov v23.s[3], w17
 ; CHECK-GI-NEXT:    mls v5.4s, v23.4s, v7.4s
-; CHECK-GI-NEXT:    udiv w19, w3, w6
-; CHECK-GI-NEXT:    mov w3, v18.s[2]
+; CHECK-GI-NEXT:    udiv w19, w4, w6
+; CHECK-GI-NEXT:    mov w4, v18.s[2]
 ; CHECK-GI-NEXT:    mov w6, v19.s[2]
-; CHECK-GI-NEXT:    fmov s24, w20
+; CHECK-GI-NEXT:    mov v24.s[0], w20
 ; CHECK-GI-NEXT:    uzp1 v2.8h, v6.8h, v5.8h
 ; CHECK-GI-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-GI-NEXT:    udiv w6, w3, w6
-; CHECK-GI-NEXT:    mov w3, v18.s[3]
+; CHECK-GI-NEXT:    udiv w6, w4, w6
+; CHECK-GI-NEXT:    mov w4, v18.s[3]
 ; CHECK-GI-NEXT:    mov v24.s[1], w19
-; CHECK-GI-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    udiv w3, w3, w7
+; CHECK-GI-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    udiv w4, w4, w7
 ; CHECK-GI-NEXT:    fmov w7, s16
 ; CHECK-GI-NEXT:    mov v24.s[2], w6
-; CHECK-GI-NEXT:    udiv w23, w7, w21
+; CHECK-GI-NEXT:    udiv w24, w7, w21
 ; CHECK-GI-NEXT:    mov w7, v16.s[1]
 ; CHECK-GI-NEXT:    mov w21, v17.s[1]
-; CHECK-GI-NEXT:    mov v24.s[3], w3
-; CHECK-GI-NEXT:    udiv w24, w7, w21
+; CHECK-GI-NEXT:    mov v24.s[3], w4
+; CHECK-GI-NEXT:    udiv w22, w7, w21
 ; CHECK-GI-NEXT:    mov w7, v16.s[2]
 ; CHECK-GI-NEXT:    mov w21, v17.s[2]
 ; CHECK-GI-NEXT:    ushll2 v17.8h, v1.16b, #0
-; CHECK-GI-NEXT:    fmov s25, w23
+; CHECK-GI-NEXT:    mov v25.s[0], w24
 ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    ushll v18.4s, v17.4h, #0
 ; CHECK-GI-NEXT:    ushll v29.4s, v1.4h, #0
@@ -1942,9 +1935,8 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) {
 ; CHECK-GI-NEXT:    udiv w21, w7, w21
 ; CHECK-GI-NEXT:    mov w7, v16.s[3]
 ; CHECK-GI-NEXT:    ushll2 v16.8h, v3.16b, #0
-; CHECK-GI-NEXT:    mov v25.s[1], w24
+; CHECK-GI-NEXT:    mov v25.s[1], w22
 ; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-GI-NEXT:    ldp x24, x23, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ushll v19.4s, v16.4h, #0
 ; CHECK-GI-NEXT:    ushll v31.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
@@ -1954,51 +1946,51 @@ define <32 x i8> @uv32i8(<32 x i8> %d, <32 x i8> %e) {
 ; CHECK-GI-NEXT:    mov w28, v19.s[3]
 ; CHECK-GI-NEXT:    ushll2 v19.4s, v16.8h, #0
 ; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
-; CHECK-GI-NEXT:    udiv w7, w7, w22
-; CHECK-GI-NEXT:    fmov w22, s18
+; CHECK-GI-NEXT:    udiv w7, w7, w23
+; CHECK-GI-NEXT:    fmov w23, s18
 ; CHECK-GI-NEXT:    mov v25.s[2], w21
 ; CHECK-GI-NEXT:    mls v29.4s, v24.4s, v31.4s
+; CHECK-GI-NEXT:    ldp x22, x21, [sp, #64] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    fmov w29, s19
 ; CHECK-GI-NEXT:    mov w30, v19.s[1]
-; CHECK-GI-NEXT:    mov w8, v19.s[2]
-; CHECK-GI-NEXT:    mov w10, v19.s[3]
-; CHECK-GI-NEXT:    udiv w25, w22, w25
-; CHECK-GI-NEXT:    mov w22, v18.s[1]
+; CHECK-GI-NEXT:    mov w15, v19.s[2]
+; CHECK-GI-NEXT:    udiv w25, w23, w25
+; CHECK-GI-NEXT:    mov w23, v18.s[1]
 ; CHECK-GI-NEXT:    mov v25.s[3], w7
 ; CHECK-GI-NEXT:    mls v1.4s, v25.4s, v3.4s
-; CHECK-GI-NEXT:    udiv w26, w22, w26
-; CHECK-GI-NEXT:    mov w22, v18.s[2]
-; CHECK-GI-NEXT:    fmov s26, w25
+; CHECK-GI-NEXT:    udiv w26, w23, w26
+; CHECK-GI-NEXT:    mov w23, v18.s[2]
+; CHECK-GI-NEXT:    mov v26.s[0], w25
 ; CHECK-GI-NEXT:    uzp1 v1.8h, v29.8h, v1.8h
-; CHECK-GI-NEXT:    udiv w27, w22, w27
-; CHECK-GI-NEXT:    mov w22, v18.s[3]
+; CHECK-GI-NEXT:    udiv w27, w23, w27
+; CHECK-GI-NEXT:    mov w23, v18.s[3]
 ; CHECK-GI-NEXT:    ushll2 v18.4s, v17.8h, #0
 ; CHECK-GI-NEXT:    mov v26.s[1], w26
 ; CHECK-GI-NEXT:    ushll v17.4s, v17.4h, #0
-; CHECK-GI-NEXT:    ldp x26, x25, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp x26, x25, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov w11, v18.s[2]
 ; CHECK-GI-NEXT:    mov w9, v18.s[3]
-; CHECK-GI-NEXT:    udiv w22, w22, w28
+; CHECK-GI-NEXT:    udiv w23, w23, w28
 ; CHECK-GI-NEXT:    fmov w28, s18
 ; CHECK-GI-NEXT:    mov v26.s[2], w27
 ; CHECK-GI-NEXT:    udiv w28, w28, w29
 ; CHECK-GI-NEXT:    mov w29, v18.s[1]
-; CHECK-GI-NEXT:    mov v26.s[3], w22
-; CHECK-GI-NEXT:    ldp x22, x21, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mov v26.s[3], w23
+; CHECK-GI-NEXT:    ldp x24, x23, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mls v17.4s, v26.4s, v16.4s
 ; CHECK-GI-NEXT:    udiv w29, w29, w30
-; CHECK-GI-NEXT:    mov w30, v18.s[2]
-; CHECK-GI-NEXT:    fmov s27, w28
-; CHECK-GI-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    udiv w8, w30, w8
+; CHECK-GI-NEXT:    mov v27.s[0], w28
+; CHECK-GI-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    udiv w10, w11, w15
+; CHECK-GI-NEXT:    mov w11, v19.s[3]
 ; CHECK-GI-NEXT:    mov v27.s[1], w29
-; CHECK-GI-NEXT:    ldp x29, x30, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    udiv w9, w9, w10
-; CHECK-GI-NEXT:    mov v27.s[2], w8
-; CHECK-GI-NEXT:    mov v27.s[3], w9
+; CHECK-GI-NEXT:    udiv w8, w9, w11
+; CHECK-GI-NEXT:    mov v27.s[2], w10
+; CHECK-GI-NEXT:    mov v27.s[3], w8
 ; CHECK-GI-NEXT:    mls v18.4s, v27.4s, v19.4s
 ; CHECK-GI-NEXT:    uzp1 v3.8h, v17.8h, v18.8h
 ; CHECK-GI-NEXT:    uzp1 v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT:    add sp, sp, #112
+; CHECK-GI-NEXT:    ldp x29, x30, [sp], #96 // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %s = urem <32 x i8> %d, %e
@@ -2037,7 +2029,7 @@ define <2 x i16> @sv2i16(<2 x i16> %d, <2 x i16> %e) {
 ; CHECK-GI-NEXT:    sdiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    sdiv w9, w9, w10
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
 ; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -2086,11 +2078,9 @@ define <3 x i16> @sv3i16(<3 x i16> %d, <3 x i16> %e) {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    sdiv w16, w14, w15
 ; CHECK-GI-NEXT:    msub w9, w13, w12, w11
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w9
 ; CHECK-GI-NEXT:    msub w8, w16, w15, w14
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2139,7 +2129,7 @@ define <4 x i16> @sv4i16(<4 x i16> %d, <4 x i16> %e) {
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    sdiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    sdiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v0.s[3]
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
@@ -2214,12 +2204,12 @@ define <8 x i16> @sv8i16(<8 x i16> %d, <8 x i16> %e) {
 ; CHECK-GI-NEXT:    fmov w13, s1
 ; CHECK-GI-NEXT:    mov w14, v1.s[1]
 ; CHECK-GI-NEXT:    mov w15, v1.s[2]
-; CHECK-GI-NEXT:    mov w16, v1.s[3]
 ; CHECK-GI-NEXT:    sdiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v2.s[1]
 ; CHECK-GI-NEXT:    sdiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v2.s[2]
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[3]
 ; CHECK-GI-NEXT:    sdiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v2.s[3]
 ; CHECK-GI-NEXT:    mov v4.s[1], w9
@@ -2232,11 +2222,11 @@ define <8 x i16> @sv8i16(<8 x i16> %d, <8 x i16> %e) {
 ; CHECK-GI-NEXT:    mls v2.4s, v4.4s, v3.4s
 ; CHECK-GI-NEXT:    sdiv w13, w13, w14
 ; CHECK-GI-NEXT:    mov w14, v0.s[2]
-; CHECK-GI-NEXT:    fmov s5, w12
+; CHECK-GI-NEXT:    mov v5.s[0], w12
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
 ; CHECK-GI-NEXT:    sdiv w14, w14, w15
-; CHECK-GI-NEXT:    mov w15, v0.s[3]
 ; CHECK-GI-NEXT:    mov v5.s[1], w13
-; CHECK-GI-NEXT:    sdiv w8, w15, w16
+; CHECK-GI-NEXT:    sdiv w8, w8, w12
 ; CHECK-GI-NEXT:    mov v5.s[2], w14
 ; CHECK-GI-NEXT:    mov v5.s[3], w8
 ; CHECK-GI-NEXT:    mls v0.4s, v5.4s, v1.4s
@@ -2397,18 +2387,17 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    mov w1, v7.s[3]
 ; CHECK-GI-NEXT:    sshll2 v7.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sdiv w10, w8, w9
+; CHECK-GI-NEXT:    sdiv w11, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v4.s[1]
 ; CHECK-GI-NEXT:    mov w9, v5.s[1]
 ; CHECK-GI-NEXT:    fmov w2, s7
 ; CHECK-GI-NEXT:    mov w3, v7.s[1]
 ; CHECK-GI-NEXT:    mov w4, v7.s[2]
-; CHECK-GI-NEXT:    mov w5, v7.s[3]
-; CHECK-GI-NEXT:    sdiv w11, w8, w9
+; CHECK-GI-NEXT:    sdiv w10, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v4.s[2]
 ; CHECK-GI-NEXT:    mov w9, v5.s[2]
 ; CHECK-GI-NEXT:    sshll2 v5.4s, v2.8h, #0
-; CHECK-GI-NEXT:    fmov s16, w10
+; CHECK-GI-NEXT:    mov v16.s[0], w11
 ; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    fmov w13, s5
 ; CHECK-GI-NEXT:    mov w14, v5.s[1]
@@ -2417,7 +2406,7 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    sdiv w9, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v4.s[3]
 ; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    mov v16.s[1], w11
+; CHECK-GI-NEXT:    mov v16.s[1], w10
 ; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    sdiv w8, w8, w12
 ; CHECK-GI-NEXT:    fmov w12, s4
@@ -2428,7 +2417,8 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    mls v0.4s, v16.4s, v2.4s
 ; CHECK-GI-NEXT:    sdiv w14, w12, w14
 ; CHECK-GI-NEXT:    mov w12, v4.s[2]
-; CHECK-GI-NEXT:    fmov s17, w13
+; CHECK-GI-NEXT:    mov v17.s[0], w13
+; CHECK-GI-NEXT:    mov w13, v7.s[3]
 ; CHECK-GI-NEXT:    sdiv w15, w12, w15
 ; CHECK-GI-NEXT:    mov w12, v4.s[3]
 ; CHECK-GI-NEXT:    mov v17.s[1], w14
@@ -2441,13 +2431,14 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    mls v4.4s, v17.4s, v5.4s
 ; CHECK-GI-NEXT:    sdiv w17, w17, w18
 ; CHECK-GI-NEXT:    mov w18, v6.s[2]
-; CHECK-GI-NEXT:    fmov s18, w16
+; CHECK-GI-NEXT:    mov v18.s[0], w16
 ; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
 ; CHECK-GI-NEXT:    sdiv w18, w18, w0
 ; CHECK-GI-NEXT:    mov w0, v6.s[3]
 ; CHECK-GI-NEXT:    sshll2 v6.4s, v1.8h, #0
 ; CHECK-GI-NEXT:    mov v18.s[1], w17
 ; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov w11, v6.s[3]
 ; CHECK-GI-NEXT:    sdiv w0, w0, w1
 ; CHECK-GI-NEXT:    fmov w1, s6
 ; CHECK-GI-NEXT:    mov v18.s[2], w18
@@ -2457,11 +2448,10 @@ define <16 x i16> @sv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    mls v1.4s, v18.4s, v3.4s
 ; CHECK-GI-NEXT:    sdiv w2, w2, w3
 ; CHECK-GI-NEXT:    mov w3, v6.s[2]
-; CHECK-GI-NEXT:    fmov s19, w1
+; CHECK-GI-NEXT:    mov v19.s[0], w1
 ; CHECK-GI-NEXT:    sdiv w3, w3, w4
-; CHECK-GI-NEXT:    mov w4, v6.s[3]
 ; CHECK-GI-NEXT:    mov v19.s[1], w2
-; CHECK-GI-NEXT:    sdiv w10, w4, w5
+; CHECK-GI-NEXT:    sdiv w10, w11, w13
 ; CHECK-GI-NEXT:    mov v19.s[2], w3
 ; CHECK-GI-NEXT:    mov v19.s[3], w10
 ; CHECK-GI-NEXT:    mls v6.4s, v19.4s, v7.4s
@@ -2502,7 +2492,7 @@ define <2 x i16> @uv2i16(<2 x i16> %d, <2 x i16> %e) {
 ; CHECK-GI-NEXT:    udiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    udiv w9, w9, w10
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
 ; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -2556,11 +2546,9 @@ define <3 x i16> @uv3i16(<3 x i16> %d, <3 x i16> %e) {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    udiv w16, w14, w15
 ; CHECK-GI-NEXT:    msub w9, w13, w12, w11
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w9
 ; CHECK-GI-NEXT:    msub w8, w16, w15, w14
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2609,7 +2597,7 @@ define <4 x i16> @uv4i16(<4 x i16> %d, <4 x i16> %e) {
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    udiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    udiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v0.s[3]
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
@@ -2684,12 +2672,12 @@ define <8 x i16> @uv8i16(<8 x i16> %d, <8 x i16> %e) {
 ; CHECK-GI-NEXT:    fmov w13, s1
 ; CHECK-GI-NEXT:    mov w14, v1.s[1]
 ; CHECK-GI-NEXT:    mov w15, v1.s[2]
-; CHECK-GI-NEXT:    mov w16, v1.s[3]
 ; CHECK-GI-NEXT:    udiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v2.s[1]
 ; CHECK-GI-NEXT:    udiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v2.s[2]
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NEXT:    mov w8, v0.s[3]
 ; CHECK-GI-NEXT:    udiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v2.s[3]
 ; CHECK-GI-NEXT:    mov v4.s[1], w9
@@ -2702,11 +2690,11 @@ define <8 x i16> @uv8i16(<8 x i16> %d, <8 x i16> %e) {
 ; CHECK-GI-NEXT:    mls v2.4s, v4.4s, v3.4s
 ; CHECK-GI-NEXT:    udiv w13, w13, w14
 ; CHECK-GI-NEXT:    mov w14, v0.s[2]
-; CHECK-GI-NEXT:    fmov s5, w12
+; CHECK-GI-NEXT:    mov v5.s[0], w12
+; CHECK-GI-NEXT:    mov w12, v1.s[3]
 ; CHECK-GI-NEXT:    udiv w14, w14, w15
-; CHECK-GI-NEXT:    mov w15, v0.s[3]
 ; CHECK-GI-NEXT:    mov v5.s[1], w13
-; CHECK-GI-NEXT:    udiv w8, w15, w16
+; CHECK-GI-NEXT:    udiv w8, w8, w12
 ; CHECK-GI-NEXT:    mov v5.s[2], w14
 ; CHECK-GI-NEXT:    mov v5.s[3], w8
 ; CHECK-GI-NEXT:    mls v0.4s, v5.4s, v1.4s
@@ -2867,18 +2855,17 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    mov w1, v7.s[3]
 ; CHECK-GI-NEXT:    ushll2 v7.4s, v3.8h, #0
 ; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    udiv w10, w8, w9
+; CHECK-GI-NEXT:    udiv w11, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v4.s[1]
 ; CHECK-GI-NEXT:    mov w9, v5.s[1]
 ; CHECK-GI-NEXT:    fmov w2, s7
 ; CHECK-GI-NEXT:    mov w3, v7.s[1]
 ; CHECK-GI-NEXT:    mov w4, v7.s[2]
-; CHECK-GI-NEXT:    mov w5, v7.s[3]
-; CHECK-GI-NEXT:    udiv w11, w8, w9
+; CHECK-GI-NEXT:    udiv w10, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v4.s[2]
 ; CHECK-GI-NEXT:    mov w9, v5.s[2]
 ; CHECK-GI-NEXT:    ushll2 v5.4s, v2.8h, #0
-; CHECK-GI-NEXT:    fmov s16, w10
+; CHECK-GI-NEXT:    mov v16.s[0], w11
 ; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
 ; CHECK-GI-NEXT:    fmov w13, s5
 ; CHECK-GI-NEXT:    mov w14, v5.s[1]
@@ -2887,7 +2874,7 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    udiv w9, w8, w9
 ; CHECK-GI-NEXT:    mov w8, v4.s[3]
 ; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    mov v16.s[1], w11
+; CHECK-GI-NEXT:    mov v16.s[1], w10
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    udiv w8, w8, w12
 ; CHECK-GI-NEXT:    fmov w12, s4
@@ -2898,7 +2885,8 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    mls v0.4s, v16.4s, v2.4s
 ; CHECK-GI-NEXT:    udiv w14, w12, w14
 ; CHECK-GI-NEXT:    mov w12, v4.s[2]
-; CHECK-GI-NEXT:    fmov s17, w13
+; CHECK-GI-NEXT:    mov v17.s[0], w13
+; CHECK-GI-NEXT:    mov w13, v7.s[3]
 ; CHECK-GI-NEXT:    udiv w15, w12, w15
 ; CHECK-GI-NEXT:    mov w12, v4.s[3]
 ; CHECK-GI-NEXT:    mov v17.s[1], w14
@@ -2911,13 +2899,14 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    mls v4.4s, v17.4s, v5.4s
 ; CHECK-GI-NEXT:    udiv w17, w17, w18
 ; CHECK-GI-NEXT:    mov w18, v6.s[2]
-; CHECK-GI-NEXT:    fmov s18, w16
+; CHECK-GI-NEXT:    mov v18.s[0], w16
 ; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v4.8h
 ; CHECK-GI-NEXT:    udiv w18, w18, w0
 ; CHECK-GI-NEXT:    mov w0, v6.s[3]
 ; CHECK-GI-NEXT:    ushll2 v6.4s, v1.8h, #0
 ; CHECK-GI-NEXT:    mov v18.s[1], w17
 ; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov w11, v6.s[3]
 ; CHECK-GI-NEXT:    udiv w0, w0, w1
 ; CHECK-GI-NEXT:    fmov w1, s6
 ; CHECK-GI-NEXT:    mov v18.s[2], w18
@@ -2927,11 +2916,10 @@ define <16 x i16> @uv16i16(<16 x i16> %d, <16 x i16> %e) {
 ; CHECK-GI-NEXT:    mls v1.4s, v18.4s, v3.4s
 ; CHECK-GI-NEXT:    udiv w2, w2, w3
 ; CHECK-GI-NEXT:    mov w3, v6.s[2]
-; CHECK-GI-NEXT:    fmov s19, w1
+; CHECK-GI-NEXT:    mov v19.s[0], w1
 ; CHECK-GI-NEXT:    udiv w3, w3, w4
-; CHECK-GI-NEXT:    mov w4, v6.s[3]
 ; CHECK-GI-NEXT:    mov v19.s[1], w2
-; CHECK-GI-NEXT:    udiv w10, w4, w5
+; CHECK-GI-NEXT:    udiv w10, w11, w13
 ; CHECK-GI-NEXT:    mov v19.s[2], w3
 ; CHECK-GI-NEXT:    mov v19.s[3], w10
 ; CHECK-GI-NEXT:    mls v6.4s, v19.4s, v7.4s
@@ -2970,7 +2958,7 @@ define <2 x i32> @sv2i32(<2 x i32> %d, <2 x i32> %e) {
 ; CHECK-GI-NEXT:    sdiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    sdiv w9, w9, w10
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
 ; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -3002,10 +2990,10 @@ define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) {
 ;
 ; CHECK-GI-LABEL: sv3i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov s3, v1.s[1]
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
 ; CHECK-GI-NEXT:    mov s0, v0.s[2]
 ; CHECK-GI-NEXT:    mov s1, v1.s[2]
 ; CHECK-GI-NEXT:    sdiv w10, w8, w9
@@ -3015,11 +3003,11 @@ define <3 x i32> @sv3i32(<3 x i32> %d, <3 x i32> %e) {
 ; CHECK-GI-NEXT:    fmov w15, s1
 ; CHECK-GI-NEXT:    sdiv w13, w11, w12
 ; CHECK-GI-NEXT:    msub w8, w10, w9, w8
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    sdiv w16, w14, w15
-; CHECK-GI-NEXT:    msub w9, w13, w12, w11
-; CHECK-GI-NEXT:    mov v0.s[1], w9
-; CHECK-GI-NEXT:    msub w8, w16, w15, w14
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    sdiv w9, w14, w15
+; CHECK-GI-NEXT:    msub w8, w13, w12, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    msub w8, w9, w15, w14
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3063,7 +3051,7 @@ define <4 x i32> @sv4i32(<4 x i32> %d, <4 x i32> %e) {
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    sdiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    sdiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v0.s[3]
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
@@ -3141,12 +3129,12 @@ define <8 x i32> @sv8i32(<8 x i32> %d, <8 x i32> %e) {
 ; CHECK-GI-NEXT:    fmov w13, s3
 ; CHECK-GI-NEXT:    mov w14, v3.s[1]
 ; CHECK-GI-NEXT:    mov w15, v3.s[2]
-; CHECK-GI-NEXT:    mov w16, v3.s[3]
 ; CHECK-GI-NEXT:    sdiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    sdiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NEXT:    mov w8, v1.s[3]
 ; CHECK-GI-NEXT:    sdiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v0.s[3]
 ; CHECK-GI-NEXT:    mov v4.s[1], w9
@@ -3159,11 +3147,11 @@ define <8 x i32> @sv8i32(<8 x i32> %d, <8 x i32> %e) {
 ; CHECK-GI-NEXT:    mls v0.4s, v4.4s, v2.4s
 ; CHECK-GI-NEXT:    sdiv w13, w13, w14
 ; CHECK-GI-NEXT:    mov w14, v1.s[2]
-; CHECK-GI-NEXT:    fmov s5, w12
+; CHECK-GI-NEXT:    mov v5.s[0], w12
+; CHECK-GI-NEXT:    mov w12, v3.s[3]
 ; CHECK-GI-NEXT:    sdiv w14, w14, w15
-; CHECK-GI-NEXT:    mov w15, v1.s[3]
 ; CHECK-GI-NEXT:    mov v5.s[1], w13
-; CHECK-GI-NEXT:    sdiv w8, w15, w16
+; CHECK-GI-NEXT:    sdiv w8, w8, w12
 ; CHECK-GI-NEXT:    mov v5.s[2], w14
 ; CHECK-GI-NEXT:    mov v5.s[3], w8
 ; CHECK-GI-NEXT:    mls v1.4s, v5.4s, v3.4s
@@ -3201,7 +3189,7 @@ define <2 x i32> @uv2i32(<2 x i32> %d, <2 x i32> %e) {
 ; CHECK-GI-NEXT:    udiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    udiv w9, w9, w10
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
 ; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -3233,10 +3221,10 @@ define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) {
 ;
 ; CHECK-GI-LABEL: uv3i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov s3, v1.s[1]
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
 ; CHECK-GI-NEXT:    mov s0, v0.s[2]
 ; CHECK-GI-NEXT:    mov s1, v1.s[2]
 ; CHECK-GI-NEXT:    udiv w10, w8, w9
@@ -3246,11 +3234,11 @@ define <3 x i32> @uv3i32(<3 x i32> %d, <3 x i32> %e) {
 ; CHECK-GI-NEXT:    fmov w15, s1
 ; CHECK-GI-NEXT:    udiv w13, w11, w12
 ; CHECK-GI-NEXT:    msub w8, w10, w9, w8
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    udiv w16, w14, w15
-; CHECK-GI-NEXT:    msub w9, w13, w12, w11
-; CHECK-GI-NEXT:    mov v0.s[1], w9
-; CHECK-GI-NEXT:    msub w8, w16, w15, w14
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    udiv w9, w14, w15
+; CHECK-GI-NEXT:    msub w8, w13, w12, w11
+; CHECK-GI-NEXT:    mov v0.s[1], w8
+; CHECK-GI-NEXT:    msub w8, w9, w15, w14
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3294,7 +3282,7 @@ define <4 x i32> @uv4i32(<4 x i32> %d, <4 x i32> %e) {
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    udiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov v2.s[0], w8
 ; CHECK-GI-NEXT:    udiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v0.s[3]
 ; CHECK-GI-NEXT:    mov v2.s[1], w9
@@ -3372,12 +3360,12 @@ define <8 x i32> @uv8i32(<8 x i32> %d, <8 x i32> %e) {
 ; CHECK-GI-NEXT:    fmov w13, s3
 ; CHECK-GI-NEXT:    mov w14, v3.s[1]
 ; CHECK-GI-NEXT:    mov w15, v3.s[2]
-; CHECK-GI-NEXT:    mov w16, v3.s[3]
 ; CHECK-GI-NEXT:    udiv w8, w8, w9
 ; CHECK-GI-NEXT:    mov w9, v0.s[1]
 ; CHECK-GI-NEXT:    udiv w9, w9, w10
 ; CHECK-GI-NEXT:    mov w10, v0.s[2]
-; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v4.s[0], w8
+; CHECK-GI-NEXT:    mov w8, v1.s[3]
 ; CHECK-GI-NEXT:    udiv w10, w10, w11
 ; CHECK-GI-NEXT:    mov w11, v0.s[3]
 ; CHECK-GI-NEXT:    mov v4.s[1], w9
@@ -3390,11 +3378,11 @@ define <8 x i32> @uv8i32(<8 x i32> %d, <8 x i32> %e) {
 ; CHECK-GI-NEXT:    mls v0.4s, v4.4s, v2.4s
 ; CHECK-GI-NEXT:    udiv w13, w13, w14
 ; CHECK-GI-NEXT:    mov w14, v1.s[2]
-; CHECK-GI-NEXT:    fmov s5, w12
+; CHECK-GI-NEXT:    mov v5.s[0], w12
+; CHECK-GI-NEXT:    mov w12, v3.s[3]
 ; CHECK-GI-NEXT:    udiv w14, w14, w15
-; CHECK-GI-NEXT:    mov w15, v1.s[3]
 ; CHECK-GI-NEXT:    mov v5.s[1], w13
-; CHECK-GI-NEXT:    udiv w8, w15, w16
+; CHECK-GI-NEXT:    udiv w8, w8, w12
 ; CHECK-GI-NEXT:    mov v5.s[2], w14
 ; CHECK-GI-NEXT:    mov v5.s[3], w8
 ; CHECK-GI-NEXT:    mls v1.4s, v5.4s, v3.4s
@@ -3427,14 +3415,14 @@ define <2 x i64> @sv2i64(<2 x i64> %d, <2 x i64> %e) {
 ; CHECK-GI-NEXT:    mov x11, v0.d[1]
 ; CHECK-GI-NEXT:    sdiv x8, x8, x9
 ; CHECK-GI-NEXT:    sdiv x11, x11, x10
-; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    mov v1.d[0], x8
 ; CHECK-GI-NEXT:    mov v1.d[1], x11
-; CHECK-GI-NEXT:    fmov x11, d1
-; CHECK-GI-NEXT:    mov x8, v1.d[1]
-; CHECK-GI-NEXT:    mul x9, x11, x9
-; CHECK-GI-NEXT:    mul x8, x8, x10
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    mov x11, v1.d[1]
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    mul x9, x11, x10
+; CHECK-GI-NEXT:    mov v1.d[0], x8
+; CHECK-GI-NEXT:    mov v1.d[1], x9
 ; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3481,21 +3469,21 @@ define <3 x i64> @sv3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    sdiv x8, x8, x9
 ; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    fmov x14, d3
-; CHECK-GI-NEXT:    mov x12, v3.d[1]
+; CHECK-GI-NEXT:    fmov x11, d3
+; CHECK-GI-NEXT:    mov x14, v3.d[1]
 ; CHECK-GI-NEXT:    sdiv x9, x9, x10
-; CHECK-GI-NEXT:    fmov d6, x8
+; CHECK-GI-NEXT:    mov v6.d[0], x8
 ; CHECK-GI-NEXT:    fmov x8, d2
 ; CHECK-GI-NEXT:    mov v6.d[1], x9
 ; CHECK-GI-NEXT:    fmov x9, d5
-; CHECK-GI-NEXT:    sdiv x10, x8, x9
-; CHECK-GI-NEXT:    fmov x13, d6
-; CHECK-GI-NEXT:    mov x11, v6.d[1]
-; CHECK-GI-NEXT:    mul x13, x13, x14
-; CHECK-GI-NEXT:    mul x11, x11, x12
-; CHECK-GI-NEXT:    fmov d2, x13
+; CHECK-GI-NEXT:    sdiv x12, x8, x9
+; CHECK-GI-NEXT:    fmov x10, d6
+; CHECK-GI-NEXT:    mov x13, v6.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x11, x13, x14
+; CHECK-GI-NEXT:    mov v2.d[0], x10
 ; CHECK-GI-NEXT:    mov v2.d[1], x11
-; CHECK-GI-NEXT:    msub x8, x10, x9, x8
+; CHECK-GI-NEXT:    msub x8, x12, x9, x8
 ; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -3542,26 +3530,26 @@ define <4 x i64> @sv4i64(<4 x i64> %d, <4 x i64> %e) {
 ; CHECK-GI-NEXT:    mov x14, v3.d[1]
 ; CHECK-GI-NEXT:    mov x15, v1.d[1]
 ; CHECK-GI-NEXT:    sdiv x8, x8, x9
-; CHECK-GI-NEXT:    sdiv x11, x11, x10
-; CHECK-GI-NEXT:    fmov d2, x8
 ; CHECK-GI-NEXT:    sdiv x12, x12, x13
-; CHECK-GI-NEXT:    mov v2.d[1], x11
-; CHECK-GI-NEXT:    fmov x11, d2
-; CHECK-GI-NEXT:    mov x8, v2.d[1]
-; CHECK-GI-NEXT:    mul x9, x11, x9
-; CHECK-GI-NEXT:    mul x8, x8, x10
-; CHECK-GI-NEXT:    fmov d2, x9
-; CHECK-GI-NEXT:    mov v2.d[1], x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    sdiv x11, x11, x10
+; CHECK-GI-NEXT:    mov v3.d[0], x12
 ; CHECK-GI-NEXT:    sdiv x15, x15, x14
-; CHECK-GI-NEXT:    fmov d3, x12
-; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    mov v2.d[1], x11
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mov x11, v2.d[1]
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    mul x10, x11, x10
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    mov v3.d[1], x15
-; CHECK-GI-NEXT:    fmov x11, d3
-; CHECK-GI-NEXT:    mov x10, v3.d[1]
-; CHECK-GI-NEXT:    mul x11, x11, x13
-; CHECK-GI-NEXT:    mul x10, x10, x14
-; CHECK-GI-NEXT:    fmov d3, x11
-; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    fmov x9, d3
+; CHECK-GI-NEXT:    mov x12, v3.d[1]
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    mul x9, x9, x13
+; CHECK-GI-NEXT:    mul x11, x12, x14
+; CHECK-GI-NEXT:    mov v3.d[0], x9
+; CHECK-GI-NEXT:    mov v3.d[1], x11
 ; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3592,14 +3580,14 @@ define <2 x i64> @uv2i64(<2 x i64> %d, <2 x i64> %e) {
 ; CHECK-GI-NEXT:    mov x11, v0.d[1]
 ; CHECK-GI-NEXT:    udiv x8, x8, x9
 ; CHECK-GI-NEXT:    udiv x11, x11, x10
-; CHECK-GI-NEXT:    fmov d1, x8
+; CHECK-GI-NEXT:    mov v1.d[0], x8
 ; CHECK-GI-NEXT:    mov v1.d[1], x11
-; CHECK-GI-NEXT:    fmov x11, d1
-; CHECK-GI-NEXT:    mov x8, v1.d[1]
-; CHECK-GI-NEXT:    mul x9, x11, x9
-; CHECK-GI-NEXT:    mul x8, x8, x10
-; CHECK-GI-NEXT:    fmov d1, x9
-; CHECK-GI-NEXT:    mov v1.d[1], x8
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    mov x11, v1.d[1]
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    mul x9, x11, x10
+; CHECK-GI-NEXT:    mov v1.d[0], x8
+; CHECK-GI-NEXT:    mov v1.d[1], x9
 ; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3646,21 +3634,21 @@ define <3 x i64> @uv3i64(<3 x i64> %d, <3 x i64> %e) {
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    udiv x8, x8, x9
 ; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    fmov x14, d3
-; CHECK-GI-NEXT:    mov x12, v3.d[1]
+; CHECK-GI-NEXT:    fmov x11, d3
+; CHECK-GI-NEXT:    mov x14, v3.d[1]
 ; CHECK-GI-NEXT:    udiv x9, x9, x10
-; CHECK-GI-NEXT:    fmov d6, x8
+; CHECK-GI-NEXT:    mov v6.d[0], x8
 ; CHECK-GI-NEXT:    fmov x8, d2
 ; CHECK-GI-NEXT:    mov v6.d[1], x9
 ; CHECK-GI-NEXT:    fmov x9, d5
-; CHECK-GI-NEXT:    udiv x10, x8, x9
-; CHECK-GI-NEXT:    fmov x13, d6
-; CHECK-GI-NEXT:    mov x11, v6.d[1]
-; CHECK-GI-NEXT:    mul x13, x13, x14
-; CHECK-GI-NEXT:    mul x11, x11, x12
-; CHECK-GI-NEXT:    fmov d2, x13
+; CHECK-GI-NEXT:    udiv x12, x8, x9
+; CHECK-GI-NEXT:    fmov x10, d6
+; CHECK-GI-NEXT:    mov x13, v6.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    mul x11, x13, x14
+; CHECK-GI-NEXT:    mov v2.d[0], x10
 ; CHECK-GI-NEXT:    mov v2.d[1], x11
-; CHECK-GI-NEXT:    msub x8, x10, x9, x8
+; CHECK-GI-NEXT:    msub x8, x12, x9, x8
 ; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -3707,26 +3695,26 @@ define <4 x i64> @uv4i64(<4 x i64> %d, <4 x i64> %e) {
 ; CHECK-GI-NEXT:    mov x14, v3.d[1]
 ; CHECK-GI-NEXT:    mov x15, v1.d[1]
 ; CHECK-GI-NEXT:    udiv x8, x8, x9
-; CHECK-GI-NEXT:    udiv x11, x11, x10
-; CHECK-GI-NEXT:    fmov d2, x8
 ; CHECK-GI-NEXT:    udiv x12, x12, x13
-; CHECK-GI-NEXT:    mov v2.d[1], x11
-; CHECK-GI-NEXT:    fmov x11, d2
-; CHECK-GI-NEXT:    mov x8, v2.d[1]
-; CHECK-GI-NEXT:    mul x9, x11, x9
-; CHECK-GI-NEXT:    mul x8, x8, x10
-; CHECK-GI-NEXT:    fmov d2, x9
-; CHECK-GI-NEXT:    mov v2.d[1], x8
+; CHECK-GI-NEXT:    mov v2.d[0], x8
+; CHECK-GI-NEXT:    udiv x11, x11, x10
+; CHECK-GI-NEXT:    mov v3.d[0], x12
 ; CHECK-GI-NEXT:    udiv x15, x15, x14
-; CHECK-GI-NEXT:    fmov d3, x12
-; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    mov v2.d[1], x11
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mov x11, v2.d[1]
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    mul x10, x11, x10
+; CHECK-GI-NEXT:    mov v2.d[0], x8
 ; CHECK-GI-NEXT:    mov v3.d[1], x15
-; CHECK-GI-NEXT:    fmov x11, d3
-; CHECK-GI-NEXT:    mov x10, v3.d[1]
-; CHECK-GI-NEXT:    mul x11, x11, x13
-; CHECK-GI-NEXT:    mul x10, x10, x14
-; CHECK-GI-NEXT:    fmov d3, x11
-; CHECK-GI-NEXT:    mov v3.d[1], x10
+; CHECK-GI-NEXT:    mov v2.d[1], x10
+; CHECK-GI-NEXT:    fmov x9, d3
+; CHECK-GI-NEXT:    mov x12, v3.d[1]
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT:    mul x9, x9, x13
+; CHECK-GI-NEXT:    mul x11, x12, x14
+; CHECK-GI-NEXT:    mov v3.d[0], x9
+; CHECK-GI-NEXT:    mov v3.d[1], x11
 ; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v3.2d
 ; CHECK-GI-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index fa0447c2c5d798..adac75758220e2 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -165,18 +165,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    sqadd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    sqadd v0.8b, v3.8b, v5.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x2]
 ; CHECK-GI-NEXT:    ret
@@ -249,12 +251,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-GI-LABEL: v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    sqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
+; CHECK-GI-NEXT:    sqadd v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    str h0, [x2]
 ; CHECK-GI-NEXT:    str h1, [x2, #2]

diff  --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 529a3b72e09714..0f256c1f18f589 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -224,15 +224,13 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    lsl w10, w2, #8
 ; CHECK-GI-NEXT:    sxth w8, w8
 ; CHECK-GI-NEXT:    sxth w9, w9
-; CHECK-GI-NEXT:    sxth w10, w10
 ; CHECK-GI-NEXT:    asr w8, w8, #8
 ; CHECK-GI-NEXT:    asr w9, w9, #8
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    asr w8, w10, #8
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    sxth w8, w10
+; CHECK-GI-NEXT:    asr w8, w8, #8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -254,10 +252,10 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) {
 ; CHECK-GI-LABEL: sext_v3i8_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sxtb w8, w0
-; CHECK-GI-NEXT:    sxtb w9, w1
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    sxtb w8, w1
+; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    sxtb w8, w2
-; CHECK-GI-NEXT:    mov v0.s[1], w9
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -311,7 +309,7 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    smov w8, v0.h[0]
 ; CHECK-GI-NEXT:    smov w9, v0.h[1]
-; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    smov w8, v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
@@ -391,15 +389,13 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    lsl w10, w2, #6
 ; CHECK-GI-NEXT:    sxth w8, w8
 ; CHECK-GI-NEXT:    sxth w9, w9
-; CHECK-GI-NEXT:    sxth w10, w10
 ; CHECK-GI-NEXT:    asr w8, w8, #6
 ; CHECK-GI-NEXT:    asr w9, w9, #6
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    asr w8, w10, #6
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    sxth w8, w10
+; CHECK-GI-NEXT:    asr w8, w8, #6
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -421,10 +417,10 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) {
 ; CHECK-GI-LABEL: sext_v3i10_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sbfx w8, w0, #0, #10
-; CHECK-GI-NEXT:    sbfx w9, w1, #0, #10
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    sbfx w8, w1, #0, #10
+; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    sbfx w8, w2, #0, #10
-; CHECK-GI-NEXT:    mov v0.s[1], w9
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1033,43 +1029,29 @@ define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) {
 ; CHECK-GI-LABEL: sext_v16i10_v16i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr w8, [sp]
-; CHECK-GI-NEXT:    ldr w9, [sp, #8]
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s2, w1
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    fmov s3, w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w2
-; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v1.h[1], w9
+; CHECK-GI-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NEXT:    mov v1.h[2], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w3
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v0.h[3], w3
+; CHECK-GI-NEXT:    mov v1.h[3], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #32]
-; CHECK-GI-NEXT:    mov v0.h[3], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w4
-; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v0.h[4], w4
+; CHECK-GI-NEXT:    mov v1.h[4], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #40]
-; CHECK-GI-NEXT:    mov v0.h[4], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w5
-; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v0.h[5], w5
+; CHECK-GI-NEXT:    mov v1.h[5], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #48]
-; CHECK-GI-NEXT:    mov v0.h[5], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w6
-; CHECK-GI-NEXT:    mov v1.h[5], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v0.h[6], w6
+; CHECK-GI-NEXT:    mov v1.h[6], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #56]
-; CHECK-GI-NEXT:    mov v0.h[6], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w7
-; CHECK-GI-NEXT:    mov v1.h[6], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w8
-; CHECK-GI-NEXT:    mov v0.h[7], v2.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v3.h[0]
+; CHECK-GI-NEXT:    mov v0.h[7], w7
+; CHECK-GI-NEXT:    mov v1.h[7], w8
 ; CHECK-GI-NEXT:    shl v0.8h, v0.8h, #6
 ; CHECK-GI-NEXT:    shl v1.8h, v1.8h, #6
 ; CHECK-GI-NEXT:    sshr v0.8h, v0.8h, #6
@@ -1123,54 +1105,42 @@ define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v16i10_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
 ; CHECK-GI-NEXT:    ldr w8, [sp]
-; CHECK-GI-NEXT:    fmov s2, w5
-; CHECK-GI-NEXT:    ldr w9, [sp, #8]
-; CHECK-GI-NEXT:    ldr w10, [sp, #32]
+; CHECK-GI-NEXT:    ldr w9, [sp, #32]
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w4
+; CHECK-GI-NEXT:    ldr w10, [sp, #8]
 ; CHECK-GI-NEXT:    ldr w11, [sp, #40]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    fmov s3, w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w4
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    fmov s5, w10
-; CHECK-GI-NEXT:    fmov s6, w11
+; CHECK-GI-NEXT:    mov v0.h[1], w1
 ; CHECK-GI-NEXT:    ldr w9, [sp, #48]
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w2
-; CHECK-GI-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NEXT:    fmov s4, w8
-; CHECK-GI-NEXT:    fmov s6, w9
+; CHECK-GI-NEXT:    mov v1.h[1], w5
+; CHECK-GI-NEXT:    mov v2.h[1], w10
+; CHECK-GI-NEXT:    mov v3.h[1], w11
+; CHECK-GI-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NEXT:    mov v1.h[2], w6
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mov v3.h[2], w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #24]
 ; CHECK-GI-NEXT:    ldr w9, [sp, #56]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w6
-; CHECK-GI-NEXT:    mov v3.h[2], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w8
-; CHECK-GI-NEXT:    mov v5.h[2], v6.h[0]
-; CHECK-GI-NEXT:    fmov s6, w9
-; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w3
-; CHECK-GI-NEXT:    mov v3.h[3], v4.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w7
-; CHECK-GI-NEXT:    mov v5.h[3], v6.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
-; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v0.h[3], w3
+; CHECK-GI-NEXT:    mov v1.h[3], w7
+; CHECK-GI-NEXT:    mov v2.h[3], w8
+; CHECK-GI-NEXT:    mov v3.h[3], w9
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v5.4h, #0
-; CHECK-GI-NEXT:    shl v2.4s, v2.4s, #22
 ; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #22
-; CHECK-GI-NEXT:    shl v3.4s, v3.4s, #22
-; CHECK-GI-NEXT:    sshr v2.4s, v2.4s, #22
 ; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #22
+; CHECK-GI-NEXT:    shl v2.4s, v2.4s, #22
+; CHECK-GI-NEXT:    shl v3.4s, v3.4s, #22
 ; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #22
-; CHECK-GI-NEXT:    sshr v3.4s, v3.4s, #22
 ; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #22
+; CHECK-GI-NEXT:    sshr v2.4s, v2.4s, #22
+; CHECK-GI-NEXT:    sshr v3.4s, v3.4s, #22
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i10> %a to <16 x i32>
@@ -1228,67 +1198,55 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v16i10_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
 ; CHECK-GI-NEXT:    ldr w8, [sp]
-; CHECK-GI-NEXT:    fmov s2, w5
-; CHECK-GI-NEXT:    ldr w9, [sp, #8]
 ; CHECK-GI-NEXT:    ldr w10, [sp, #32]
-; CHECK-GI-NEXT:    ldr w11, [sp, #40]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    fmov s1, w4
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    ldr w9, [sp, #48]
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
+; CHECK-GI-NEXT:    ldr w11, [sp, #40]
 ; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    fmov s3, w10
 ; CHECK-GI-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v1.h[1], w5
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    mov v3.h[1], w11
+; CHECK-GI-NEXT:    ldr w9, [sp, #48]
+; CHECK-GI-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NEXT:    mov v1.h[2], w6
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mov v3.h[2], w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w10
-; CHECK-GI-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w2
-; CHECK-GI-NEXT:    mov v2.h[2], v5.h[0]
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    mov v0.h[2], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w6
-; CHECK-GI-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-GI-NEXT:    mov v1.h[2], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #56]
-; CHECK-GI-NEXT:    mov v3.h[2], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w3
-; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w7
-; CHECK-GI-NEXT:    ushll v6.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll2 v2.2d, v2.4s, #0
-; CHECK-GI-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    shl v6.2d, v6.2d, #54
+; CHECK-GI-NEXT:    mov v0.h[3], w3
+; CHECK-GI-NEXT:    mov v1.h[3], w7
+; CHECK-GI-NEXT:    mov v2.h[3], w8
+; CHECK-GI-NEXT:    mov v3.h[3], w9
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    shl v18.2d, v2.2d, #54
-; CHECK-GI-NEXT:    mov v3.h[3], v4.h[0]
 ; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-GI-NEXT:    shl v4.2d, v4.2d, #54
-; CHECK-GI-NEXT:    shl v16.2d, v0.2d, #54
+; CHECK-GI-NEXT:    ushll v6.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll2 v2.2d, v2.4s, #0
 ; CHECK-GI-NEXT:    ushll v7.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
+; CHECK-GI-NEXT:    shl v4.2d, v4.2d, #54
+; CHECK-GI-NEXT:    shl v16.2d, v0.2d, #54
 ; CHECK-GI-NEXT:    shl v5.2d, v5.2d, #54
 ; CHECK-GI-NEXT:    shl v17.2d, v1.2d, #54
-; CHECK-GI-NEXT:    sshr v0.2d, v4.2d, #54
-; CHECK-GI-NEXT:    sshr v1.2d, v16.2d, #54
-; CHECK-GI-NEXT:    sshr v4.2d, v6.2d, #54
+; CHECK-GI-NEXT:    shl v6.2d, v6.2d, #54
+; CHECK-GI-NEXT:    shl v18.2d, v2.2d, #54
 ; CHECK-GI-NEXT:    shl v7.2d, v7.2d, #54
 ; CHECK-GI-NEXT:    shl v19.2d, v3.2d, #54
+; CHECK-GI-NEXT:    sshr v0.2d, v4.2d, #54
+; CHECK-GI-NEXT:    sshr v1.2d, v16.2d, #54
 ; CHECK-GI-NEXT:    sshr v2.2d, v5.2d, #54
 ; CHECK-GI-NEXT:    sshr v3.2d, v17.2d, #54
+; CHECK-GI-NEXT:    sshr v4.2d, v6.2d, #54
 ; CHECK-GI-NEXT:    sshr v5.2d, v18.2d, #54
 ; CHECK-GI-NEXT:    sshr v6.2d, v7.2d, #54
 ; CHECK-GI-NEXT:    sshr v7.2d, v19.2d, #54

diff  --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 9c8d3e0f07de87..951458da17c07e 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -537,22 +537,29 @@ define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){
 ; CHECK-GI-NEXT:    mov h3, v1.h[1]
 ; CHECK-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NEXT:    mov h6, v1.h[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    mov h2, v1.h[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov h3, v1.h[3]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    mov v1.b[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    mov v1.b[2], w9
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.b[3], w8
+; CHECK-GI-NEXT:    mov v1.b[3], w9
 ; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = shl <4 x i8> %0, %1
@@ -587,10 +594,10 @@ define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov s3, v1.s[1]
-; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
 ; CHECK-GI-NEXT:    ushl v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
@@ -628,7 +635,7 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsl w8, w8, w9
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = shl <1 x i32> %0, %1
@@ -684,24 +691,31 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov h3, v0.h[1]
 ; CHECK-GI-NEXT:    mov h4, v1.h[2]
-; CHECK-GI-NEXT:    mov h5, v1.h[3]
-; CHECK-GI-NEXT:    mov h6, v0.h[3]
-; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov v0.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov h2, v1.h[3]
+; CHECK-GI-NEXT:    fmov w9, s4
+; CHECK-GI-NEXT:    mov h4, v0.h[3]
+; CHECK-GI-NEXT:    mov v1.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov h3, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v1.b[2], w9
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.b[3], w8
 ; CHECK-GI-NEXT:    neg v1.8b, v1.8b
 ; CHECK-GI-NEXT:    sshl v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = ashr <4 x i8> %0, %1
@@ -734,11 +748,11 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){
 ; CHECK-GI-LABEL: ashr_v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    mov w8, v1.s[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s3, v0.s[1]
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov w9, v0.s[1]
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
 ; CHECK-GI-NEXT:    neg v1.4h, v1.4h
 ; CHECK-GI-NEXT:    sshl v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
@@ -774,7 +788,7 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    asr w8, w8, w9
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = ashr <1 x i32> %0, %1
@@ -821,24 +835,31 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov h3, v0.h[1]
 ; CHECK-GI-NEXT:    mov h4, v1.h[2]
-; CHECK-GI-NEXT:    mov h5, v1.h[3]
-; CHECK-GI-NEXT:    mov h6, v0.h[3]
-; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov v0.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v6.b[0]
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov h2, v1.h[3]
+; CHECK-GI-NEXT:    fmov w9, s4
+; CHECK-GI-NEXT:    mov h4, v0.h[3]
+; CHECK-GI-NEXT:    mov v1.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov h3, v0.h[2]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s3
+; CHECK-GI-NEXT:    mov v1.b[2], w9
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v1.b[3], w8
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    mov v0.b[3], w8
 ; CHECK-GI-NEXT:    neg v1.8b, v1.8b
 ; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
-; CHECK-GI-NEXT:    mov b2, v0.b[2]
-; CHECK-GI-NEXT:    mov b3, v0.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v3.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
+; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = lshr <4 x i8> %0, %1
@@ -870,11 +891,11 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){
 ; CHECK-GI-LABEL: lshr_v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov s2, v1.s[1]
+; CHECK-GI-NEXT:    mov w8, v1.s[1]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s3, v0.s[1]
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[1], v3.h[0]
+; CHECK-GI-NEXT:    mov w9, v0.s[1]
+; CHECK-GI-NEXT:    mov v1.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[1], w9
 ; CHECK-GI-NEXT:    neg v1.4h, v1.4h
 ; CHECK-GI-NEXT:    ushl v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
@@ -910,7 +931,7 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsr w8, w8, w9
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = lshr <1 x i32> %0, %1
@@ -962,16 +983,12 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-LABEL: shl_v3i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
-; CHECK-GI-NEXT:    fmov s2, w3
-; CHECK-GI-NEXT:    fmov s3, w4
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    mov v2.b[1], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w5
-; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    fmov s1, w3
+; CHECK-GI-NEXT:    mov v0.b[1], w1
+; CHECK-GI-NEXT:    mov v1.b[1], w4
+; CHECK-GI-NEXT:    mov v0.b[2], w2
+; CHECK-GI-NEXT:    mov v1.b[2], w5
+; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
 ; CHECK-GI-NEXT:    umov w1, v0.b[1]
 ; CHECK-GI-NEXT:    umov w2, v0.b[2]
@@ -1038,15 +1055,11 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-LABEL: ashr_v3i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    fmov s0, w3
-; CHECK-GI-NEXT:    fmov s1, w4
-; CHECK-GI-NEXT:    fmov s2, w1
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
 ; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w5
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w2
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], w4
+; CHECK-GI-NEXT:    mov v1.b[1], w1
+; CHECK-GI-NEXT:    mov v0.b[2], w5
+; CHECK-GI-NEXT:    mov v1.b[2], w2
 ; CHECK-GI-NEXT:    neg v0.8b, v0.8b
 ; CHECK-GI-NEXT:    sshl v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
@@ -1118,15 +1131,11 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-LABEL: lshr_v3i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    fmov s0, w3
-; CHECK-GI-NEXT:    fmov s1, w4
-; CHECK-GI-NEXT:    fmov s2, w1
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
 ; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w5
-; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w2
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], w4
+; CHECK-GI-NEXT:    mov v1.b[1], w1
+; CHECK-GI-NEXT:    mov v0.b[2], w5
+; CHECK-GI-NEXT:    mov v1.b[2], w2
 ; CHECK-GI-NEXT:    neg v0.8b, v0.8b
 ; CHECK-GI-NEXT:    ushl v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]

diff  --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index b1131f287fe9a9..954458e4459749 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -213,17 +213,23 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    mov h3, v1.h[1]
-; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NEXT:    mov h6, v1.h[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
+; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    mov h2, v1.h[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov h3, v1.h[3]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    mov v1.b[1], w9
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w9, s2
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    mov v1.b[2], w9
+; CHECK-GI-NEXT:    fmov w8, s5
+; CHECK-GI-NEXT:    fmov w9, s3
+; CHECK-GI-NEXT:    mov v0.b[3], w8
+; CHECK-GI-NEXT:    mov v1.b[3], w9
+; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI15_0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
@@ -280,11 +286,11 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
-; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v1.h[1], w9
 ; CHECK-GI-NEXT:    adrp x8, .LCPI17_0
-; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI17_0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
@@ -397,8 +403,17 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
 ;
 ; CHECK-GI-LABEL: shufflevector_v4i8_zeroes:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    dup v0.8b, w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov h1, v0.h[3]
+; CHECK-GI-NEXT:    mov v0.b[1], w8
+; CHECK-GI-NEXT:    fmov w8, s2
+; CHECK-GI-NEXT:    mov v0.b[2], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.b[3], w8
+; CHECK-GI-NEXT:    dup v0.8b, v0.b[0]
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -433,8 +448,10 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
 ;
 ; CHECK-GI-LABEL: shufflevector_v2i16_zeroes:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov w8, s0
-; CHECK-GI-NEXT:    dup v0.4h, w8
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    dup v0.4h, v0.h[0]
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
     %c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 0, i32 0>
@@ -493,18 +510,14 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
 ; CHECK-GI-LABEL: shufflevector_v3i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    fmov s1, w3
 ; CHECK-GI-NEXT:    adrp x8, .LCPI30_0
-; CHECK-GI-NEXT:    fmov s2, w3
-; CHECK-GI-NEXT:    fmov s3, w4
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    mov v2.b[1], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w5
-; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], w1
+; CHECK-GI-NEXT:    mov v1.b[1], w4
+; CHECK-GI-NEXT:    mov v0.b[2], w2
+; CHECK-GI-NEXT:    mov v1.b[2], w5
+; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI30_0]
-; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
 ; CHECK-GI-NEXT:    mov b2, v0.b[2]
@@ -614,7 +627,10 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
 ;
 ; CHECK-GI-LABEL: shufflevector_v3i8_zeroes:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    dup v0.8b, w0
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    mov v0.b[1], w1
+; CHECK-GI-NEXT:    mov v0.b[2], w2
+; CHECK-GI-NEXT:    dup v0.8b, v0.b[0]
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
 ; CHECK-GI-NEXT:    mov b2, v0.b[2]
 ; CHECK-GI-NEXT:    fmov w0, s0

diff  --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index d8b2762cf15e90..12371ef2c0021b 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -166,18 +166,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    sqsub v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    sqsub v0.8b, v3.8b, v5.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x2]
 ; CHECK-GI-NEXT:    ret
@@ -250,12 +252,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-GI-LABEL: v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    sqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
+; CHECK-GI-NEXT:    sqsub v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    str h0, [x2]
 ; CHECK-GI-NEXT:    str h1, [x2, #2]

diff  --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll
index 907605494dfbd0..8e7586bd4843c7 100644
--- a/llvm/test/CodeGen/AArch64/sub.ll
+++ b/llvm/test/CodeGen/AArch64/sub.ll
@@ -71,13 +71,13 @@ define void @v2i8(ptr %p1, ptr %p2) {
 ;
 ; CHECK-GI-LABEL: v2i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr b0, [x0]
-; CHECK-GI-NEXT:    ldr b1, [x0, #1]
-; CHECK-GI-NEXT:    ldr b2, [x1]
+; CHECK-GI-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    ld1 { v1.b }[0], [x1]
+; CHECK-GI-NEXT:    ldr b2, [x0, #1]
 ; CHECK-GI-NEXT:    ldr b3, [x1, #1]
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    sub v0.2s, v0.2s, v2.2s
+; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v3.s[0]
+; CHECK-GI-NEXT:    sub v0.2s, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str b0, [x0]
 ; CHECK-GI-NEXT:    str b1, [x0, #1]
@@ -112,22 +112,18 @@ define void @v3i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: v3i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldrb w8, [x0]
-; CHECK-GI-NEXT:    ldrb w9, [x0, #1]
-; CHECK-GI-NEXT:    ldrb w10, [x1]
+; CHECK-GI-NEXT:    ldrb w9, [x1]
+; CHECK-GI-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-GI-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    fmov s3, w11
 ; CHECK-GI-NEXT:    ldrb w8, [x0, #2]
 ; CHECK-GI-NEXT:    ldrb w9, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    sub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    mov v0.h[1], w10
+; CHECK-GI-NEXT:    mov v1.h[1], w11
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v1.h[2], w9
+; CHECK-GI-NEXT:    sub v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov h2, v0.h[2]
 ; CHECK-GI-NEXT:    str b0, [x0]
@@ -159,27 +155,27 @@ define void @v4i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v5.8b, #0
 ; CHECK-GI-NEXT:    sub v0.4h, v0.4h, v1.4h
-; CHECK-GI-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NEXT:    mov h2, v0.h[2]
-; CHECK-GI-NEXT:    mov h3, v0.h[3]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    xtn v0.8b, v0.8h
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
+; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
+; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
+; CHECK-GI-NEXT:    xtn v0.8b, v1.8h
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x0]
 ; CHECK-GI-NEXT:    ret
@@ -247,13 +243,13 @@ define void @v2i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    sub v0.2s, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    str h0, [x0]
@@ -281,18 +277,16 @@ define void @v3i16(ptr %p1, ptr %p2) {
 ; CHECK-GI-LABEL: v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    ldr h1, [x1]
 ; CHECK-GI-NEXT:    add x8, x0, #2
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    add x10, x1, #4
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
 ; CHECK-GI-NEXT:    add x9, x0, #4
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #4]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    ldr h3, [x1, #4]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[2], v3.h[0]
-; CHECK-GI-NEXT:    sub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ld1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    ld1 { v1.h }[2], [x10]
+; CHECK-GI-NEXT:    sub v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    str h0, [x0]
 ; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
 ; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]

diff  --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index afc0d8704ebace..e99935e8677fc7 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -162,18 +162,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    uqadd v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    uqadd v0.8b, v3.8b, v5.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x2]
 ; CHECK-GI-NEXT:    ret
@@ -248,12 +250,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-GI-LABEL: v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    uqadd v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
+; CHECK-GI-NEXT:    uqadd v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    str h0, [x2]
 ; CHECK-GI-NEXT:    str h1, [x2, #2]

diff  --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index dfcbe96ea948a8..cdba9625431a58 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -163,18 +163,20 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    mov b2, v0.b[1]
-; CHECK-GI-NEXT:    mov b3, v1.b[1]
-; CHECK-GI-NEXT:    mov b4, v0.b[2]
-; CHECK-GI-NEXT:    mov b5, v0.b[3]
-; CHECK-GI-NEXT:    mov b6, v1.b[3]
-; CHECK-GI-NEXT:    mov v0.b[1], v2.b[0]
-; CHECK-GI-NEXT:    mov b2, v1.b[2]
-; CHECK-GI-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[2], v4.b[0]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    uqsub v0.8b, v0.8b, v1.8b
+; CHECK-GI-NEXT:    mov v3.b[0], v0.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[1]
+; CHECK-GI-NEXT:    mov v5.b[0], v1.b[0]
+; CHECK-GI-NEXT:    mov v3.b[1], v2.b[0]
+; CHECK-GI-NEXT:    mov b2, v0.b[2]
+; CHECK-GI-NEXT:    mov b0, v0.b[3]
+; CHECK-GI-NEXT:    mov v5.b[1], v4.b[0]
+; CHECK-GI-NEXT:    mov b4, v1.b[2]
+; CHECK-GI-NEXT:    mov b1, v1.b[3]
+; CHECK-GI-NEXT:    mov v3.b[2], v2.b[0]
+; CHECK-GI-NEXT:    mov v5.b[2], v4.b[0]
+; CHECK-GI-NEXT:    mov v3.b[3], v0.b[0]
+; CHECK-GI-NEXT:    mov v5.b[3], v1.b[0]
+; CHECK-GI-NEXT:    uqsub v0.8b, v3.8b, v5.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    str w8, [x2]
 ; CHECK-GI-NEXT:    ret
@@ -245,12 +247,12 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-GI-LABEL: v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr h0, [x0]
-; CHECK-GI-NEXT:    ldr h1, [x0, #2]
-; CHECK-GI-NEXT:    ldr h2, [x1]
-; CHECK-GI-NEXT:    ldr h3, [x1, #2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    uqsub v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    ldr h1, [x1]
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x1, #2
+; CHECK-GI-NEXT:    ld1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    ld1 { v1.h }[1], [x9]
+; CHECK-GI-NEXT:    uqsub v0.4h, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    str h0, [x2]
 ; CHECK-GI-NEXT:    str h1, [x2, #2]

diff  --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 54ada05c904487..f46e6ae989ff20 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -3813,71 +3813,49 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
 ; CHECK-GI-LABEL: add_v24i8_v24i16_zext:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
 ; CHECK-GI-NEXT:    ldr w8, [sp]
-; CHECK-GI-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
 ; CHECK-GI-NEXT:    ldr w10, [sp, #72]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w3
-; CHECK-GI-NEXT:    mov v0.b[3], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w4
-; CHECK-GI-NEXT:    mov v0.b[4], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w5
-; CHECK-GI-NEXT:    mov v0.b[5], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w6
-; CHECK-GI-NEXT:    mov v0.b[6], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w7
-; CHECK-GI-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], w1
+; CHECK-GI-NEXT:    mov v0.b[2], w2
+; CHECK-GI-NEXT:    mov v0.b[3], w3
+; CHECK-GI-NEXT:    mov v0.b[4], w4
+; CHECK-GI-NEXT:    mov v0.b[5], w5
+; CHECK-GI-NEXT:    mov v0.b[6], w6
+; CHECK-GI-NEXT:    mov v0.b[7], w7
+; CHECK-GI-NEXT:    mov v0.b[8], w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #64]
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #8]
-; CHECK-GI-NEXT:    fmov s3, w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-NEXT:    mov v0.b[8], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v0.b[9], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #80]
-; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #88]
-; CHECK-GI-NEXT:    mov v0.b[9], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[1], w10
+; CHECK-GI-NEXT:    mov v0.b[10], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #96]
-; CHECK-GI-NEXT:    mov v0.b[10], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[2], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-NEXT:    mov v0.b[11], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #32]
-; CHECK-GI-NEXT:    mov v1.b[3], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #104]
-; CHECK-GI-NEXT:    mov v0.b[11], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[3], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-NEXT:    mov v0.b[12], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #40]
-; CHECK-GI-NEXT:    mov v1.b[4], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #112]
-; CHECK-GI-NEXT:    mov v0.b[12], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[4], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-NEXT:    mov v0.b[13], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #48]
-; CHECK-GI-NEXT:    mov v1.b[5], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #120]
-; CHECK-GI-NEXT:    mov v0.b[13], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[5], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-NEXT:    mov v0.b[14], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #56]
-; CHECK-GI-NEXT:    mov v1.b[6], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    mov v0.b[14], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
-; CHECK-GI-NEXT:    mov v1.b[7], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[15], v3.b[0]
-; CHECK-GI-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-NEXT:    mov v1.b[6], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-NEXT:    mov v0.b[15], w8
+; CHECK-GI-NEXT:    mov v1.b[7], w9
 ; CHECK-GI-NEXT:    uaddlv h0, v0.16b
-; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    uaddlv h1, v1.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w0, w8, w9
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3960,71 +3938,49 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) {
 ; CHECK-GI-LABEL: add_v24i8_v24i16_sext:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
 ; CHECK-GI-NEXT:    ldr w8, [sp]
-; CHECK-GI-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
 ; CHECK-GI-NEXT:    ldr w10, [sp, #72]
-; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    fmov s2, w10
-; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w3
-; CHECK-GI-NEXT:    mov v0.b[3], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w4
-; CHECK-GI-NEXT:    mov v0.b[4], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w5
-; CHECK-GI-NEXT:    mov v0.b[5], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w6
-; CHECK-GI-NEXT:    mov v0.b[6], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w7
-; CHECK-GI-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[1], w1
+; CHECK-GI-NEXT:    mov v0.b[2], w2
+; CHECK-GI-NEXT:    mov v0.b[3], w3
+; CHECK-GI-NEXT:    mov v0.b[4], w4
+; CHECK-GI-NEXT:    mov v0.b[5], w5
+; CHECK-GI-NEXT:    mov v0.b[6], w6
+; CHECK-GI-NEXT:    mov v0.b[7], w7
+; CHECK-GI-NEXT:    mov v0.b[8], w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #64]
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    ldr w8, [sp, #8]
-; CHECK-GI-NEXT:    fmov s3, w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-NEXT:    mov v0.b[8], v1.b[0]
-; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    mov v0.b[9], w9
 ; CHECK-GI-NEXT:    ldr w9, [sp, #80]
-; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #88]
-; CHECK-GI-NEXT:    mov v0.b[9], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[1], w10
+; CHECK-GI-NEXT:    mov v0.b[10], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #96]
-; CHECK-GI-NEXT:    mov v0.b[10], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[2], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-NEXT:    mov v0.b[11], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #32]
-; CHECK-GI-NEXT:    mov v1.b[3], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #104]
-; CHECK-GI-NEXT:    mov v0.b[11], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[3], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-NEXT:    mov v0.b[12], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #40]
-; CHECK-GI-NEXT:    mov v1.b[4], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #112]
-; CHECK-GI-NEXT:    mov v0.b[12], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[4], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-NEXT:    mov v0.b[13], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #48]
-; CHECK-GI-NEXT:    mov v1.b[5], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #120]
-; CHECK-GI-NEXT:    mov v0.b[13], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[5], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-NEXT:    mov v0.b[14], w8
 ; CHECK-GI-NEXT:    ldr w8, [sp, #56]
-; CHECK-GI-NEXT:    mov v1.b[6], v2.b[0]
-; CHECK-GI-NEXT:    fmov s2, w9
-; CHECK-GI-NEXT:    mov v0.b[14], v3.b[0]
-; CHECK-GI-NEXT:    fmov s3, w8
-; CHECK-GI-NEXT:    mov v1.b[7], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[15], v3.b[0]
-; CHECK-GI-NEXT:    saddlv h1, v1.8b
+; CHECK-GI-NEXT:    mov v1.b[6], w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-NEXT:    mov v0.b[15], w8
+; CHECK-GI-NEXT:    mov v1.b[7], w9
 ; CHECK-GI-NEXT:    saddlv h0, v0.16b
-; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    saddlv h1, v1.8b
 ; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w0, w8, w9
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4168,71 +4124,49 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
 ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
 ; CHECK-GI-BASE-NEXT:    fmov s0, w0
-; CHECK-GI-BASE-NEXT:    fmov s1, w1
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp]
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #8]
 ; CHECK-GI-BASE-NEXT:    ldr w10, [sp, #72]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w2
-; CHECK-GI-BASE-NEXT:    fmov s2, w10
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w3
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w4
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w5
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w6
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w7
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], w1
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], w2
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], w3
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], w4
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], w5
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], w6
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], w7
+; CHECK-GI-BASE-NEXT:    mov v0.b[8], w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #64]
 ; CHECK-GI-BASE-NEXT:    fmov s1, w8
-; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #8]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-BASE-NEXT:    mov v0.b[8], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w9
+; CHECK-GI-BASE-NEXT:    mov v0.b[9], w9
 ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #80]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88]
-; CHECK-GI-BASE-NEXT:    mov v0.b[9], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], w10
+; CHECK-GI-BASE-NEXT:    mov v0.b[10], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96]
-; CHECK-GI-BASE-NEXT:    mov v0.b[10], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-BASE-NEXT:    mov v0.b[11], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #32]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104]
-; CHECK-GI-BASE-NEXT:    mov v0.b[11], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-BASE-NEXT:    mov v0.b[12], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #40]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112]
-; CHECK-GI-BASE-NEXT:    mov v0.b[12], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-BASE-NEXT:    mov v0.b[13], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #48]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120]
-; CHECK-GI-BASE-NEXT:    mov v0.b[13], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-BASE-NEXT:    mov v0.b[14], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #56]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    mov v0.b[14], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v2.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[15], v3.b[0]
-; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-BASE-NEXT:    mov v0.b[15], w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], w9
 ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
@@ -4240,76 +4174,54 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
 ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
 ; CHECK-GI-DOT-NEXT:    fmov s0, w0
-; CHECK-GI-DOT-NEXT:    fmov s1, w1
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp]
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp]
 ; CHECK-GI-DOT-NEXT:    ldr w10, [sp, #72]
-; CHECK-GI-DOT-NEXT:    movi v4.8b, #1
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8]
-; CHECK-GI-DOT-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w2
-; CHECK-GI-DOT-NEXT:    fmov s3, w10
-; CHECK-GI-DOT-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w3
-; CHECK-GI-DOT-NEXT:    mov v0.b[3], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w4
-; CHECK-GI-DOT-NEXT:    mov v0.b[4], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w5
-; CHECK-GI-DOT-NEXT:    mov v0.b[5], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w6
-; CHECK-GI-DOT-NEXT:    mov v0.b[6], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w7
-; CHECK-GI-DOT-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.8b, #1
 ; CHECK-GI-DOT-NEXT:    fmov s1, w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #80]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v0.b[1], w1
+; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], w10
+; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT:    mov v0.b[2], w2
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #88]
-; CHECK-GI-DOT-NEXT:    mov v0.b[8], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[3], w3
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #96]
-; CHECK-GI-DOT-NEXT:    mov v0.b[9], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[4], w4
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #104]
-; CHECK-GI-DOT-NEXT:    mov v0.b[10], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[5], w5
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #112]
-; CHECK-GI-DOT-NEXT:    mov v0.b[11], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[6], w6
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #120]
-; CHECK-GI-DOT-NEXT:    mov v0.b[12], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    mov v0.b[7], w7
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[8], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8]
+; CHECK-GI-DOT-NEXT:    fmov d1, d1
+; CHECK-GI-DOT-NEXT:    mov v0.b[9], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-DOT-NEXT:    udot v4.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    mov v0.b[10], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-DOT-NEXT:    mov v0.b[11], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-DOT-NEXT:    mov v0.b[12], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-DOT-NEXT:    mov v0.b[13], w8
 ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #48]
-; CHECK-GI-DOT-NEXT:    fmov s5, w9
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w8
+; CHECK-GI-DOT-NEXT:    mov v0.b[14], w8
 ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #56]
-; CHECK-GI-DOT-NEXT:    mov v0.b[13], v2.b[0]
-; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v0.b[14], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w8
-; CHECK-GI-DOT-NEXT:    mov v4.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT:    fmov d1, d1
-; CHECK-GI-DOT-NEXT:    mov v0.b[15], v3.b[0]
-; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    udot v5.4s, v0.16b, v4.16b
-; CHECK-GI-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v3.4s
+; CHECK-GI-DOT-NEXT:    mov v0.b[15], w8
+; CHECK-GI-DOT-NEXT:    udot v5.4s, v0.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -4484,71 +4396,49 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
 ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
 ; CHECK-GI-BASE-NEXT:    fmov s0, w0
-; CHECK-GI-BASE-NEXT:    fmov s1, w1
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp]
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #8]
 ; CHECK-GI-BASE-NEXT:    ldr w10, [sp, #72]
-; CHECK-GI-BASE-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w2
-; CHECK-GI-BASE-NEXT:    fmov s2, w10
-; CHECK-GI-BASE-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w3
-; CHECK-GI-BASE-NEXT:    mov v0.b[3], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w4
-; CHECK-GI-BASE-NEXT:    mov v0.b[4], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w5
-; CHECK-GI-BASE-NEXT:    mov v0.b[5], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w6
-; CHECK-GI-BASE-NEXT:    mov v0.b[6], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w7
-; CHECK-GI-BASE-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], w1
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], w2
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], w3
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], w4
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], w5
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], w6
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], w7
+; CHECK-GI-BASE-NEXT:    mov v0.b[8], w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #64]
 ; CHECK-GI-BASE-NEXT:    fmov s1, w8
-; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #8]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-BASE-NEXT:    mov v0.b[8], v1.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s1, w9
+; CHECK-GI-BASE-NEXT:    mov v0.b[9], w9
 ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #80]
-; CHECK-GI-BASE-NEXT:    mov v1.b[1], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88]
-; CHECK-GI-BASE-NEXT:    mov v0.b[9], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], w10
+; CHECK-GI-BASE-NEXT:    mov v0.b[10], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-BASE-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96]
-; CHECK-GI-BASE-NEXT:    mov v0.b[10], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-BASE-NEXT:    mov v0.b[11], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #32]
-; CHECK-GI-BASE-NEXT:    mov v1.b[3], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104]
-; CHECK-GI-BASE-NEXT:    mov v0.b[11], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-BASE-NEXT:    mov v0.b[12], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #40]
-; CHECK-GI-BASE-NEXT:    mov v1.b[4], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112]
-; CHECK-GI-BASE-NEXT:    mov v0.b[12], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-BASE-NEXT:    mov v0.b[13], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #48]
-; CHECK-GI-BASE-NEXT:    mov v1.b[5], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120]
-; CHECK-GI-BASE-NEXT:    mov v0.b[13], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-BASE-NEXT:    mov v0.b[14], w8
 ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #56]
-; CHECK-GI-BASE-NEXT:    mov v1.b[6], v2.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s2, w9
-; CHECK-GI-BASE-NEXT:    mov v0.b[14], v3.b[0]
-; CHECK-GI-BASE-NEXT:    fmov s3, w8
-; CHECK-GI-BASE-NEXT:    mov v1.b[7], v2.b[0]
-; CHECK-GI-BASE-NEXT:    mov v0.b[15], v3.b[0]
-; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-BASE-NEXT:    mov v0.b[15], w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], w9
 ; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
-; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b
 ; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    fmov w9, s1
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
@@ -4556,76 +4446,54 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
 ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
 ; CHECK-GI-DOT-NEXT:    fmov s0, w0
-; CHECK-GI-DOT-NEXT:    fmov s1, w1
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp]
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp]
 ; CHECK-GI-DOT-NEXT:    ldr w10, [sp, #72]
-; CHECK-GI-DOT-NEXT:    movi v4.8b, #1
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8]
-; CHECK-GI-DOT-NEXT:    mov v0.b[1], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w2
-; CHECK-GI-DOT-NEXT:    fmov s3, w10
-; CHECK-GI-DOT-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w3
-; CHECK-GI-DOT-NEXT:    mov v0.b[3], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w4
-; CHECK-GI-DOT-NEXT:    mov v0.b[4], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w5
-; CHECK-GI-DOT-NEXT:    mov v0.b[5], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w6
-; CHECK-GI-DOT-NEXT:    mov v0.b[6], v1.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s1, w7
-; CHECK-GI-DOT-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
+; CHECK-GI-DOT-NEXT:    movi v3.8b, #1
 ; CHECK-GI-DOT-NEXT:    fmov s1, w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #80]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v0.b[1], w1
+; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], w10
+; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT:    mov v0.b[2], w2
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #88]
-; CHECK-GI-DOT-NEXT:    mov v0.b[8], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[3], w3
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #96]
-; CHECK-GI-DOT-NEXT:    mov v0.b[9], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[4], w4
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #104]
-; CHECK-GI-DOT-NEXT:    mov v0.b[10], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[5], w5
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #112]
-; CHECK-GI-DOT-NEXT:    mov v0.b[11], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
-; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[6], w6
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], w9
 ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #120]
-; CHECK-GI-DOT-NEXT:    mov v0.b[12], v2.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    mov v0.b[7], w7
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], w9
+; CHECK-GI-DOT-NEXT:    mov v0.b[8], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8]
+; CHECK-GI-DOT-NEXT:    fmov d1, d1
+; CHECK-GI-DOT-NEXT:    mov v0.b[9], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-DOT-NEXT:    sdot v4.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    mov v0.b[10], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-DOT-NEXT:    mov v0.b[11], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-DOT-NEXT:    mov v0.b[12], w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-DOT-NEXT:    mov v0.b[13], w8
 ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #48]
-; CHECK-GI-DOT-NEXT:    fmov s5, w9
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w8
+; CHECK-GI-DOT-NEXT:    mov v0.b[14], w8
 ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #56]
-; CHECK-GI-DOT-NEXT:    mov v0.b[13], v2.b[0]
-; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v5.b[0]
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v0.b[14], v3.b[0]
-; CHECK-GI-DOT-NEXT:    fmov s3, w8
-; CHECK-GI-DOT-NEXT:    mov v4.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT:    fmov d1, d1
-; CHECK-GI-DOT-NEXT:    mov v0.b[15], v3.b[0]
-; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    sdot v5.4s, v0.16b, v4.16b
-; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v3.4s
+; CHECK-GI-DOT-NEXT:    mov v0.b[15], w8
+; CHECK-GI-DOT-NEXT:    sdot v5.4s, v0.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index e536ba240453e2..ead790203f9496 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -127,12 +127,19 @@ entry:
 }
 
 define <2 x i8> @xtn_v2i128_v2i8(<2 x i128> %a) {
-; CHECK-LABEL: xtn_v2i128_v2i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    mov v0.s[1], w2
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: xtn_v2i128_v2i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov v0.s[1], w2
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v2i128_v2i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.s[0], w0
+; CHECK-GI-NEXT:    mov v0.s[1], w2
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = trunc <2 x i128> %a to <2 x i8>
   ret <2 x i8> %arg1
@@ -168,8 +175,7 @@ define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) {
 ; CHECK-GI-LABEL: xtn_v2i128_v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w2
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w2
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
@@ -189,23 +195,36 @@ entry:
 }
 
 define <2 x i32> @xtn_v2i128_v2i32(<2 x i128> %a) {
-; CHECK-LABEL: xtn_v2i128_v2i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    mov v0.s[1], w2
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: xtn_v2i128_v2i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov v0.s[1], w2
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v2i128_v2i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.s[0], w0
+; CHECK-GI-NEXT:    mov v0.s[1], w2
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = trunc <2 x i128> %a to <2 x i32>
   ret <2 x i32> %arg1
 }
 
 define <2 x i64> @xtn_v2i128_v2i64(<2 x i128> %a) {
-; CHECK-LABEL: xtn_v2i128_v2i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    mov v0.d[1], x2
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: xtn_v2i128_v2i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov d0, x0
+; CHECK-SD-NEXT:    mov v0.d[1], x2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v2i128_v2i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v0.d[0], x0
+; CHECK-GI-NEXT:    mov v0.d[1], x2
+; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = trunc <2 x i128> %a to <2 x i64>
   ret <2 x i64> %arg1
@@ -282,10 +301,10 @@ define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: xtn_v3i32_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov s1, v0.s[1]
-; CHECK-GI-NEXT:    mov s2, v0.s[2]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    mov w8, v0.s[1]
+; CHECK-GI-NEXT:    mov w9, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    mov v0.h[2], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -309,11 +328,9 @@ define <3 x i16> @xtn_v3i64_v3i16(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    fmov x8, d0
 ; CHECK-GI-NEXT:    fmov x9, d1
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    fmov x8, d2
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -334,10 +351,10 @@ define <3 x i32> @xtn_v3i64_v3i32(<3 x i64> %a) {
 ; CHECK-GI-LABEL: xtn_v3i64_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    fmov x8, d2
-; CHECK-GI-NEXT:    mov v0.s[1], w9
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index bb968c8eb00fcb..7e95b6684e8211 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -245,11 +245,9 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    and w8, w0, #0xff
 ; CHECK-GI-NEXT:    and w9, w1, #0xff
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    and w8, w2, #0xff
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -271,10 +269,10 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) {
 ; CHECK-GI-LABEL: zext_v3i8_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    and w8, w0, #0xff
-; CHECK-GI-NEXT:    and w9, w1, #0xff
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    and w8, w1, #0xff
+; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    and w8, w2, #0xff
-; CHECK-GI-NEXT:    mov v0.s[1], w9
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -328,7 +326,7 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    umov w8, v0.h[0]
 ; CHECK-GI-NEXT:    umov w9, v0.h[1]
-; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[0], w8
 ; CHECK-GI-NEXT:    umov w8, v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
@@ -406,11 +404,9 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    and w8, w0, #0x3ff
 ; CHECK-GI-NEXT:    and w9, w1, #0x3ff
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    fmov s1, w9
 ; CHECK-GI-NEXT:    and w8, w2, #0x3ff
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[1], w9
+; CHECK-GI-NEXT:    mov v0.h[2], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -432,10 +428,10 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) {
 ; CHECK-GI-LABEL: zext_v3i10_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    and w8, w0, #0x3ff
-; CHECK-GI-NEXT:    and w9, w1, #0x3ff
-; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    mov v0.s[0], w8
+; CHECK-GI-NEXT:    and w8, w1, #0x3ff
+; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    and w8, w2, #0x3ff
-; CHECK-GI-NEXT:    mov v0.s[1], w9
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -1089,51 +1085,39 @@ define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v16i10_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
 ; CHECK-GI-NEXT:    ldr w8, [sp]
-; CHECK-GI-NEXT:    fmov s3, w5
-; CHECK-GI-NEXT:    ldr w9, [sp, #8]
-; CHECK-GI-NEXT:    ldr w10, [sp, #32]
+; CHECK-GI-NEXT:    ldr w9, [sp, #32]
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w4
+; CHECK-GI-NEXT:    ldr w10, [sp, #8]
 ; CHECK-GI-NEXT:    ldr w11, [sp, #40]
 ; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    fmov s3, w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    fmov s1, w4
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    fmov s5, w10
-; CHECK-GI-NEXT:    fmov s6, w11
+; CHECK-GI-NEXT:    mov v0.h[1], w1
 ; CHECK-GI-NEXT:    ldr w9, [sp, #48]
-; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w2
-; CHECK-GI-NEXT:    mov v2.h[1], v4.h[0]
-; CHECK-GI-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NEXT:    fmov s4, w8
-; CHECK-GI-NEXT:    fmov s6, w9
+; CHECK-GI-NEXT:    movi v4.4s, #3, msl #8
+; CHECK-GI-NEXT:    mov v1.h[1], w5
+; CHECK-GI-NEXT:    mov v2.h[1], w10
+; CHECK-GI-NEXT:    mov v3.h[1], w11
+; CHECK-GI-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NEXT:    mov v1.h[2], w6
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mov v3.h[2], w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #24]
 ; CHECK-GI-NEXT:    ldr w9, [sp, #56]
-; CHECK-GI-NEXT:    mov v0.h[2], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w6
-; CHECK-GI-NEXT:    mov v2.h[2], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w8
-; CHECK-GI-NEXT:    mov v5.h[2], v6.h[0]
-; CHECK-GI-NEXT:    fmov s6, w9
-; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w3
-; CHECK-GI-NEXT:    mov v2.h[3], v4.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w7
-; CHECK-GI-NEXT:    mov v5.h[3], v6.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
-; CHECK-GI-NEXT:    movi v3.4s, #3, msl #8
-; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v0.h[3], w3
+; CHECK-GI-NEXT:    mov v1.h[3], w7
+; CHECK-GI-NEXT:    mov v2.h[3], w8
+; CHECK-GI-NEXT:    mov v3.h[3], w9
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v5.4h, #0
 ; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
-; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT:    and v3.16b, v4.16b, v3.16b
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v4.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT:    and v3.16b, v3.16b, v4.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i10> %a to <16 x i32>
@@ -1185,62 +1169,50 @@ define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v16i10_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w1
 ; CHECK-GI-NEXT:    ldr w8, [sp]
-; CHECK-GI-NEXT:    fmov s2, w5
-; CHECK-GI-NEXT:    ldr w9, [sp, #8]
 ; CHECK-GI-NEXT:    ldr w10, [sp, #32]
-; CHECK-GI-NEXT:    ldr w11, [sp, #40]
-; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    fmov s1, w4
-; CHECK-GI-NEXT:    fmov s3, w9
-; CHECK-GI-NEXT:    fmov s4, w11
-; CHECK-GI-NEXT:    ldr w9, [sp, #48]
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
+; CHECK-GI-NEXT:    ldr w11, [sp, #40]
 ; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    fmov s3, w10
 ; CHECK-GI-NEXT:    ldr w8, [sp, #16]
-; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    mov v0.h[1], w1
+; CHECK-GI-NEXT:    mov v1.h[1], w5
+; CHECK-GI-NEXT:    mov v2.h[1], w9
+; CHECK-GI-NEXT:    mov v3.h[1], w11
+; CHECK-GI-NEXT:    ldr w9, [sp, #48]
+; CHECK-GI-NEXT:    mov v0.h[2], w2
+; CHECK-GI-NEXT:    mov v1.h[2], w6
+; CHECK-GI-NEXT:    mov v2.h[2], w8
+; CHECK-GI-NEXT:    mov v3.h[2], w9
 ; CHECK-GI-NEXT:    ldr w8, [sp, #24]
-; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NEXT:    fmov s3, w10
-; CHECK-GI-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w2
-; CHECK-GI-NEXT:    mov v2.h[2], v5.h[0]
-; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w9, [sp, #56]
+; CHECK-GI-NEXT:    mov v0.h[3], w3
+; CHECK-GI-NEXT:    mov v1.h[3], w7
+; CHECK-GI-NEXT:    mov v2.h[3], w8
+; CHECK-GI-NEXT:    mov v3.h[3], w9
 ; CHECK-GI-NEXT:    adrp x8, .LCPI54_0
 ; CHECK-GI-NEXT:    ldr q7, [x8, :lo12:.LCPI54_0]
-; CHECK-GI-NEXT:    mov v0.h[2], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w6
-; CHECK-GI-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-GI-NEXT:    mov v1.h[2], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w9
-; CHECK-GI-NEXT:    ldr w9, [sp, #56]
-; CHECK-GI-NEXT:    mov v3.h[2], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w3
-; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w7
-; CHECK-GI-NEXT:    ushll v17.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll2 v18.2d, v2.4s, #0
-; CHECK-GI-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-GI-NEXT:    fmov s4, w9
 ; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mov v3.h[3], v4.h[0]
 ; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v5.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
 ; CHECK-GI-NEXT:    ushll v6.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v16.2d, v1.4s, #0
-; CHECK-GI-NEXT:    and v0.16b, v4.16b, v7.16b
-; CHECK-GI-NEXT:    and v1.16b, v5.16b, v7.16b
-; CHECK-GI-NEXT:    and v4.16b, v17.16b, v7.16b
-; CHECK-GI-NEXT:    and v5.16b, v18.16b, v7.16b
+; CHECK-GI-NEXT:    ushll v17.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll2 v18.2d, v2.4s, #0
 ; CHECK-GI-NEXT:    ushll v19.2d, v3.2s, #0
 ; CHECK-GI-NEXT:    ushll2 v20.2d, v3.4s, #0
+; CHECK-GI-NEXT:    and v0.16b, v4.16b, v7.16b
+; CHECK-GI-NEXT:    and v1.16b, v5.16b, v7.16b
 ; CHECK-GI-NEXT:    and v2.16b, v6.16b, v7.16b
 ; CHECK-GI-NEXT:    and v3.16b, v16.16b, v7.16b
+; CHECK-GI-NEXT:    and v4.16b, v17.16b, v7.16b
+; CHECK-GI-NEXT:    and v5.16b, v18.16b, v7.16b
 ; CHECK-GI-NEXT:    and v6.16b, v19.16b, v7.16b
 ; CHECK-GI-NEXT:    and v7.16b, v20.16b, v7.16b
 ; CHECK-GI-NEXT:    ret


        


More information about the llvm-commits mailing list