[llvm] [AArch64][GlobalISel] Refactor Combine G_CONCAT_VECTOR (PR #80866)

via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 7 02:25:43 PST 2024


https://github.com/chuongg3 updated https://github.com/llvm/llvm-project/pull/80866

>From 5e36dff851caa191f4230b095eee0ab48e0313b5 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Mon, 5 Feb 2024 16:42:20 +0000
Subject: [PATCH 1/2] [AArch64][GlobalISel] Refactor Combine G_CONCAT_VECTOR

The combine is now driven by TableGen and checks that the new instruction
is legal before creating it.
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |  24 +-
 .../include/llvm/Target/GlobalISel/Combine.td |  11 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  40 +-
 llvm/lib/Target/AArch64/AArch64Combine.td     |   4 +-
 .../GISel/AArch64O0PreLegalizerCombiner.cpp   |   2 -
 .../GISel/AArch64PreLegalizerCombiner.cpp     |   2 -
 .../AMDGPU/AMDGPUPreLegalizerCombiner.cpp     |   2 -
 llvm/test/CodeGen/AArch64/itofp.ll            | 570 +++++++++---------
 .../AArch64/neon-bitwise-instructions.ll      |  14 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    | 352 +++++------
 10 files changed, 510 insertions(+), 511 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 10eeafdd09a8e..133b5d2c4e6b7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -224,22 +224,22 @@ class CombinerHelper {
   /// - concat_vector(undef, undef) => undef
   /// - concat_vector(build_vector(A, B), build_vector(C, D)) =>
   ///   build_vector(A, B, C, D)
-  ///
-  /// \pre MI.getOpcode() == G_CONCAT_VECTORS.
-  bool tryCombineConcatVectors(MachineInstr &MI);
+  /// ==========================================================
   /// Check if the G_CONCAT_VECTORS \p MI is undef or if it
   /// can be flattened into a build_vector.
-  /// In the first case \p IsUndef will be true.
-  /// In the second case \p Ops will contain the operands needed
-  /// to produce the flattened build_vector.
+  /// In the first case \p matchinfo.first will be true.
+  /// In the second case \p matchinfo.second will contain the operands
+  /// needed to produce the flattened build_vector.
   ///
   /// \pre MI.getOpcode() == G_CONCAT_VECTORS.
-  bool matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
-                                 SmallVectorImpl<Register> &Ops);
-  /// Replace \p MI with a flattened build_vector with \p Ops or an
-  /// implicit_def if IsUndef is true.
-  void applyCombineConcatVectors(MachineInstr &MI, bool IsUndef,
-                                 const ArrayRef<Register> Ops);
+  bool
+  matchCombineConcatVectors(MachineInstr &MI,
+                            std::pair<bool, SmallVector<Register>> &matchinfo);
+  /// Replace \p MI with a flattened build_vector of \p matchinfo.second
+  /// or an implicit_def if \p matchinfo.first is true.
+  void
+  applyCombineConcatVectors(MachineInstr &MI,
+                            std::pair<bool, SmallVector<Register>> &matchinfo);
 
   /// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS.
   /// Returns true if MI changed.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9b0e1b0d7c4f9..77a6faaf837d5 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1253,6 +1253,14 @@ def match_ors : GICombineRule<
         [{ return Helper.matchOr(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+// Combines concat operations
+def concat_matchinfo : GIDefMatchData<"std::pair<bool, SmallVector<Register>>">;
+def combine_concat_vector : GICombineRule<
+  (defs root:$root, concat_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_CONCAT_VECTORS):$root,
+        [{ return Helper.matchCombineConcatVectors(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyCombineConcatVectors(*${root}, ${matchinfo}); }])>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -1326,7 +1334,8 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax, redundant_binop_in_equality,
-    fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors]>;
+    fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, 
+    combine_concat_vector]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1b199cfd41d23..71b383b54a141 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -222,21 +222,11 @@ void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
   replaceRegWith(MRI, DstReg, SrcReg);
 }
 
-bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
-  bool IsUndef = false;
-  SmallVector<Register, 4> Ops;
-  if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
-    applyCombineConcatVectors(MI, IsUndef, Ops);
-    return true;
-  }
-  return false;
-}
-
-bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
-                                               SmallVectorImpl<Register> &Ops) {
+bool CombinerHelper::matchCombineConcatVectors(
+    MachineInstr &MI, std::pair<bool, SmallVector<Register>> &matchinfo) {
   assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
          "Invalid instruction");
-  IsUndef = true;
+  matchinfo.first = true;
   MachineInstr *Undef = nullptr;
 
   // Walk over all the operands of concat vectors and check if they are
@@ -246,13 +236,15 @@ bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
     Register Reg = MO.getReg();
     MachineInstr *Def = MRI.getVRegDef(Reg);
     assert(Def && "Operand not defined");
+    if (!MRI.hasOneNonDBGUse(Reg))
+      return false;
     switch (Def->getOpcode()) {
     case TargetOpcode::G_BUILD_VECTOR:
-      IsUndef = false;
+      matchinfo.first = false;
       // Remember the operands of the build_vector to fold
       // them into the yet-to-build flattened concat vectors.
       for (const MachineOperand &BuildVecMO : Def->uses())
-        Ops.push_back(BuildVecMO.getReg());
+        matchinfo.second.push_back(BuildVecMO.getReg());
       break;
     case TargetOpcode::G_IMPLICIT_DEF: {
       LLT OpType = MRI.getType(Reg);
@@ -268,17 +260,25 @@ bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
       // for the flattening.
       for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
            EltIdx != EltEnd; ++EltIdx)
-        Ops.push_back(Undef->getOperand(0).getReg());
+        matchinfo.second.push_back(Undef->getOperand(0).getReg());
       break;
     }
     default:
       return false;
     }
   }
+
+  // Check if the combine is illegal
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_BUILD_VECTOR,
+                                 {DstTy, MRI.getType(matchinfo.second[0])}})) {
+    return false;
+  }
+
   return true;
 }
 void CombinerHelper::applyCombineConcatVectors(
-    MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
+    MachineInstr &MI, std::pair<bool, SmallVector<Register>> &matchinfo) {
   // We determined that the concat_vectors can be flatten.
   // Generate the flattened build_vector.
   Register DstReg = MI.getOperand(0).getReg();
@@ -289,12 +289,12 @@ void CombinerHelper::applyCombineConcatVectors(
   // checking that at all Ops are undef.  Alternatively, we could have
   // generate a build_vector of undefs and rely on another combine to
   // clean that up.  For now, given we already gather this information
-  // in tryCombineConcatVectors, just save compile time and issue the
+  // in matchCombineConcatVectors, just save compile time and issue the
   // right thing.
-  if (IsUndef)
+  if (matchinfo.first)
     Builder.buildUndef(NewDstReg);
   else
-    Builder.buildBuildVector(NewDstReg, Ops);
+    Builder.buildBuildVector(NewDstReg, matchinfo.second);
   MI.eraseFromParent();
   replaceRegWith(MRI, DstReg, NewDstReg);
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1daa7d5fe6a7a..e4d8359c71e62 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -64,7 +64,7 @@ def AArch64PreLegalizerCombiner: GICombiner<
 }
 
 def AArch64O0PreLegalizerCombiner: GICombiner<
-  "AArch64O0PreLegalizerCombinerImpl", [optnone_combines]> {
+  "AArch64O0PreLegalizerCombinerImpl", [optnone_combines, combine_concat_vector]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 
@@ -288,5 +288,5 @@ def AArch64PostLegalizerCombiner
                         constant_fold_binops, identity_combines,
                         ptr_add_immed_chain, overlapping_and,
                         split_store_zero_128, undef_combines,
-                        select_to_minmax, or_to_bsp]> {
+                        select_to_minmax, or_to_bsp, combine_concat_vector]> {
 }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
index 0b82ed1280ddd..17dd8f2314a2b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
@@ -91,8 +91,6 @@ bool AArch64O0PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
 
   unsigned Opc = MI.getOpcode();
   switch (Opc) {
-  case TargetOpcode::G_CONCAT_VECTORS:
-    return Helper.tryCombineConcatVectors(MI);
   case TargetOpcode::G_SHUFFLE_VECTOR:
     return Helper.tryCombineShuffleVector(MI);
   case TargetOpcode::G_MEMCPY_INLINE:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 574d065ab01bb..a82d3cd095659 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -720,8 +720,6 @@ bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
 
   unsigned Opc = MI.getOpcode();
   switch (Opc) {
-  case TargetOpcode::G_CONCAT_VECTORS:
-    return Helper.tryCombineConcatVectors(MI);
   case TargetOpcode::G_SHUFFLE_VECTOR:
     return Helper.tryCombineShuffleVector(MI);
   case TargetOpcode::G_UADDO:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 0c7e198810da7..f14d970f1e5de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -106,8 +106,6 @@ bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
     return true;
 
   switch (MI.getOpcode()) {
-  case TargetOpcode::G_CONCAT_VECTORS:
-    return Helper.tryCombineConcatVectors(MI);
   case TargetOpcode::G_SHUFFLE_VECTOR:
     return Helper.tryCombineShuffleVector(MI);
   }
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index fa1ab61a6216f..0965d82f707e6 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4043,28 +4043,28 @@ define <8 x half> @stofp_v8i64_v8f16(<8 x i64> %a) {
 ; CHECK-GI-FP16-LABEL: stofp_v8i64_v8f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT:    scvtf v2.2d, v2.2d
 ; CHECK-GI-FP16-NEXT:    scvtf v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT:    scvtf v2.2d, v2.2d
 ; CHECK-GI-FP16-NEXT:    scvtf v3.2d, v3.2d
 ; CHECK-GI-FP16-NEXT:    mov d4, v0.d[1]
-; CHECK-GI-FP16-NEXT:    mov d5, v2.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
-; CHECK-GI-FP16-NEXT:    fcvt h2, d2
-; CHECK-GI-FP16-NEXT:    fcvt h4, d4
-; CHECK-GI-FP16-NEXT:    fcvt h5, d5
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-FP16-NEXT:    mov d5, v1.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-GI-FP16-NEXT:    mov d5, v3.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h3, d3
 ; CHECK-GI-FP16-NEXT:    fcvt h4, d4
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v4.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h4, d5
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h1, d5
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-GI-FP16-NEXT:    mov d1, v2.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
 ; CHECK-GI-FP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-GI-FP16-NEXT:    fcvt h1, d1
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h3, d3
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h1, d2
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[7], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = sitofp <8 x i64> %a to <8 x half>
@@ -4103,28 +4103,28 @@ define <8 x half> @utofp_v8i64_v8f16(<8 x i64> %a) {
 ; CHECK-GI-FP16-LABEL: utofp_v8i64_v8f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v2.2d, v2.2d
 ; CHECK-GI-FP16-NEXT:    ucvtf v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v2.2d, v2.2d
 ; CHECK-GI-FP16-NEXT:    ucvtf v3.2d, v3.2d
 ; CHECK-GI-FP16-NEXT:    mov d4, v0.d[1]
-; CHECK-GI-FP16-NEXT:    mov d5, v2.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
-; CHECK-GI-FP16-NEXT:    fcvt h2, d2
-; CHECK-GI-FP16-NEXT:    fcvt h4, d4
-; CHECK-GI-FP16-NEXT:    fcvt h5, d5
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-FP16-NEXT:    mov d5, v1.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v5.h[0]
-; CHECK-GI-FP16-NEXT:    mov d5, v3.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h3, d3
 ; CHECK-GI-FP16-NEXT:    fcvt h4, d4
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v4.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h4, d5
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h1, d5
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v3.h[0]
+; CHECK-GI-FP16-NEXT:    mov d1, v2.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
 ; CHECK-GI-FP16-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-GI-FP16-NEXT:    fcvt h1, d1
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h3, d3
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v1.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h1, d2
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[7], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = uitofp <8 x i64> %a to <8 x half>
@@ -4183,51 +4183,51 @@ define <16 x half> @stofp_v16i64_v16f16(<16 x i64> %a) {
 ; CHECK-GI-FP16-LABEL: stofp_v16i64_v16f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT:    scvtf v16.2d, v2.2d
 ; CHECK-GI-FP16-NEXT:    scvtf v4.2d, v4.2d
-; CHECK-GI-FP16-NEXT:    scvtf v2.2d, v6.2d
-; CHECK-GI-FP16-NEXT:    scvtf v20.2d, v1.2d
-; CHECK-GI-FP16-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT:    scvtf v18.2d, v1.2d
 ; CHECK-GI-FP16-NEXT:    scvtf v5.2d, v5.2d
-; CHECK-GI-FP16-NEXT:    scvtf v7.2d, v7.2d
-; CHECK-GI-FP16-NEXT:    mov d6, v0.d[1]
-; CHECK-GI-FP16-NEXT:    mov d17, v16.d[1]
-; CHECK-GI-FP16-NEXT:    mov d18, v4.d[1]
-; CHECK-GI-FP16-NEXT:    mov d19, v2.d[1]
+; CHECK-GI-FP16-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-GI-FP16-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT:    mov d16, v0.d[1]
+; CHECK-GI-FP16-NEXT:    mov d17, v4.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
-; CHECK-GI-FP16-NEXT:    fcvt h16, d16
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d4
-; CHECK-GI-FP16-NEXT:    fcvt h2, d2
-; CHECK-GI-FP16-NEXT:    fcvt h6, d6
-; CHECK-GI-FP16-NEXT:    fcvt h17, d17
-; CHECK-GI-FP16-NEXT:    fcvt h4, d18
-; CHECK-GI-FP16-NEXT:    fcvt h18, d19
-; CHECK-GI-FP16-NEXT:    fcvt h19, d20
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov d6, v20.d[1]
-; CHECK-GI-FP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-FP16-NEXT:    mov d17, v3.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h3, d3
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov d4, v5.d[1]
+; CHECK-GI-FP16-NEXT:    mov d19, v5.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h5, d5
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v18.h[0]
-; CHECK-GI-FP16-NEXT:    mov d18, v7.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h7, d7
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v19.h[0]
-; CHECK-GI-FP16-NEXT:    mov v16.h[2], v3.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h3, d6
+; CHECK-GI-FP16-NEXT:    fcvt h16, d16
+; CHECK-GI-FP16-NEXT:    fcvt h4, d17
+; CHECK-GI-FP16-NEXT:    mov d17, v18.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h18, d18
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v16.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v4.h[0]
+; CHECK-GI-FP16-NEXT:    scvtf v4.2d, v6.2d
 ; CHECK-GI-FP16-NEXT:    fcvt h6, d17
-; CHECK-GI-FP16-NEXT:    fcvt h4, d4
+; CHECK-GI-FP16-NEXT:    fcvt h16, d19
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v18.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v1.h[2], v5.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h5, d18
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v7.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-FP16-NEXT:    mov v16.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.d[1], v16.d[0]
-; CHECK-GI-FP16-NEXT:    mov v1.d[1], v2.d[0]
+; CHECK-GI-FP16-NEXT:    mov d5, v2.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
+; CHECK-GI-FP16-NEXT:    mov d17, v4.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h4, d4
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v16.h[0]
+; CHECK-GI-FP16-NEXT:    scvtf v6.2d, v7.2d
+; CHECK-GI-FP16-NEXT:    fcvt h5, d5
+; CHECK-GI-FP16-NEXT:    fcvt h7, d17
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h3, d3
+; CHECK-GI-FP16-NEXT:    mov d4, v6.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h6, d6
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v5.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v7.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
+; CHECK-GI-FP16-NEXT:    fcvt h4, d4
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[7], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[7], v4.h[0]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = sitofp <16 x i64> %a to <16 x half>
@@ -4286,51 +4286,51 @@ define <16 x half> @utofp_v16i64_v16f16(<16 x i64> %a) {
 ; CHECK-GI-FP16-LABEL: utofp_v16i64_v16f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v16.2d, v2.2d
 ; CHECK-GI-FP16-NEXT:    ucvtf v4.2d, v4.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v2.2d, v6.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v20.2d, v1.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v18.2d, v1.2d
 ; CHECK-GI-FP16-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v7.2d, v7.2d
-; CHECK-GI-FP16-NEXT:    mov d6, v0.d[1]
-; CHECK-GI-FP16-NEXT:    mov d17, v16.d[1]
-; CHECK-GI-FP16-NEXT:    mov d18, v4.d[1]
-; CHECK-GI-FP16-NEXT:    mov d19, v2.d[1]
+; CHECK-GI-FP16-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT:    mov d16, v0.d[1]
+; CHECK-GI-FP16-NEXT:    mov d17, v4.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
-; CHECK-GI-FP16-NEXT:    fcvt h16, d16
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d4
-; CHECK-GI-FP16-NEXT:    fcvt h2, d2
-; CHECK-GI-FP16-NEXT:    fcvt h6, d6
-; CHECK-GI-FP16-NEXT:    fcvt h17, d17
-; CHECK-GI-FP16-NEXT:    fcvt h4, d18
-; CHECK-GI-FP16-NEXT:    fcvt h18, d19
-; CHECK-GI-FP16-NEXT:    fcvt h19, d20
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov d6, v20.d[1]
-; CHECK-GI-FP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-FP16-NEXT:    mov d17, v3.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h3, d3
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov d4, v5.d[1]
+; CHECK-GI-FP16-NEXT:    mov d19, v5.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h5, d5
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v18.h[0]
-; CHECK-GI-FP16-NEXT:    mov d18, v7.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h7, d7
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v19.h[0]
-; CHECK-GI-FP16-NEXT:    mov v16.h[2], v3.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h3, d6
+; CHECK-GI-FP16-NEXT:    fcvt h16, d16
+; CHECK-GI-FP16-NEXT:    fcvt h4, d17
+; CHECK-GI-FP16-NEXT:    mov d17, v18.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h18, d18
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v16.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v4.h[0]
+; CHECK-GI-FP16-NEXT:    ucvtf v4.2d, v6.2d
 ; CHECK-GI-FP16-NEXT:    fcvt h6, d17
-; CHECK-GI-FP16-NEXT:    fcvt h4, d4
+; CHECK-GI-FP16-NEXT:    fcvt h16, d19
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v18.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v1.h[2], v5.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h5, d18
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v7.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v3.h[0]
-; CHECK-GI-FP16-NEXT:    mov v16.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v4.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v5.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.d[1], v16.d[0]
-; CHECK-GI-FP16-NEXT:    mov v1.d[1], v2.d[0]
+; CHECK-GI-FP16-NEXT:    mov d5, v2.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
+; CHECK-GI-FP16-NEXT:    mov d17, v4.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h4, d4
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v16.h[0]
+; CHECK-GI-FP16-NEXT:    ucvtf v6.2d, v7.2d
+; CHECK-GI-FP16-NEXT:    fcvt h5, d5
+; CHECK-GI-FP16-NEXT:    fcvt h7, d17
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h3, d3
+; CHECK-GI-FP16-NEXT:    mov d4, v6.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h6, d6
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v5.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v7.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
+; CHECK-GI-FP16-NEXT:    fcvt h4, d4
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[7], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[7], v4.h[0]
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = uitofp <16 x i64> %a to <16 x half>
@@ -4436,103 +4436,104 @@ define <32 x half> @stofp_v32i64_v32f16(<32 x i64> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: stofp_v32i64_v32f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    scvtf v16.2d, v2.2d
+; CHECK-GI-FP16-NEXT:    ldp q16, q18, [sp]
 ; CHECK-GI-FP16-NEXT:    scvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT:    scvtf v18.2d, v4.2d
-; CHECK-GI-FP16-NEXT:    scvtf v17.2d, v6.2d
-; CHECK-GI-FP16-NEXT:    scvtf v4.2d, v1.2d
-; CHECK-GI-FP16-NEXT:    scvtf v3.2d, v3.2d
-; CHECK-GI-FP16-NEXT:    ldp q1, q23, [sp]
+; CHECK-GI-FP16-NEXT:    ldp q17, q19, [sp, #64]
+; CHECK-GI-FP16-NEXT:    scvtf v4.2d, v4.2d
+; CHECK-GI-FP16-NEXT:    scvtf v1.2d, v1.2d
 ; CHECK-GI-FP16-NEXT:    scvtf v5.2d, v5.2d
-; CHECK-GI-FP16-NEXT:    scvtf v6.2d, v7.2d
-; CHECK-GI-FP16-NEXT:    mov d20, v16.d[1]
-; CHECK-GI-FP16-NEXT:    mov d19, v0.d[1]
-; CHECK-GI-FP16-NEXT:    mov d21, v18.d[1]
-; CHECK-GI-FP16-NEXT:    mov d22, v17.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h16, d16
-; CHECK-GI-FP16-NEXT:    scvtf v2.2d, v1.2d
+; CHECK-GI-FP16-NEXT:    scvtf v6.2d, v6.2d
+; CHECK-GI-FP16-NEXT:    scvtf v20.2d, v16.2d
+; CHECK-GI-FP16-NEXT:    scvtf v24.2d, v18.2d
+; CHECK-GI-FP16-NEXT:    scvtf v2.2d, v2.2d
+; CHECK-GI-FP16-NEXT:    scvtf v16.2d, v17.2d
+; CHECK-GI-FP16-NEXT:    mov d21, v0.d[1]
+; CHECK-GI-FP16-NEXT:    scvtf v25.2d, v19.2d
+; CHECK-GI-FP16-NEXT:    mov d22, v4.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
-; CHECK-GI-FP16-NEXT:    fcvt h1, d18
-; CHECK-GI-FP16-NEXT:    ldr q18, [sp, #32]
-; CHECK-GI-FP16-NEXT:    fcvt h7, d17
-; CHECK-GI-FP16-NEXT:    ldp q25, q17, [sp, #48]
-; CHECK-GI-FP16-NEXT:    fcvt h20, d20
-; CHECK-GI-FP16-NEXT:    fcvt h24, d19
-; CHECK-GI-FP16-NEXT:    fcvt h21, d21
-; CHECK-GI-FP16-NEXT:    fcvt h22, d22
-; CHECK-GI-FP16-NEXT:    scvtf v18.2d, v18.2d
-; CHECK-GI-FP16-NEXT:    fcvt h26, d4
-; CHECK-GI-FP16-NEXT:    scvtf v17.2d, v17.2d
-; CHECK-GI-FP16-NEXT:    fcvt h27, d3
-; CHECK-GI-FP16-NEXT:    fcvt h28, d6
-; CHECK-GI-FP16-NEXT:    scvtf v23.2d, v23.2d
-; CHECK-GI-FP16-NEXT:    scvtf v25.2d, v25.2d
-; CHECK-GI-FP16-NEXT:    mov d4, v4.d[1]
-; CHECK-GI-FP16-NEXT:    mov v16.h[1], v20.h[0]
-; CHECK-GI-FP16-NEXT:    ldp q19, q20, [sp, #80]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v24.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h24, d5
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v21.h[0]
-; CHECK-GI-FP16-NEXT:    ldr q21, [sp, #112]
-; CHECK-GI-FP16-NEXT:    mov v7.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT:    mov d22, v2.d[1]
-; CHECK-GI-FP16-NEXT:    scvtf v20.2d, v20.2d
-; CHECK-GI-FP16-NEXT:    fcvt h2, d2
-; CHECK-GI-FP16-NEXT:    scvtf v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT:    mov v16.h[2], v27.h[0]
-; CHECK-GI-FP16-NEXT:    scvtf v21.2d, v21.2d
-; CHECK-GI-FP16-NEXT:    mov d5, v5.d[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v26.h[0]
-; CHECK-GI-FP16-NEXT:    mov d26, v18.d[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v24.h[0]
-; CHECK-GI-FP16-NEXT:    mov d24, v17.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h22, d22
-; CHECK-GI-FP16-NEXT:    mov v7.h[2], v28.h[0]
-; CHECK-GI-FP16-NEXT:    mov d27, v20.d[1]
+; CHECK-GI-FP16-NEXT:    scvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT:    mov d18, v1.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h17, d1
+; CHECK-GI-FP16-NEXT:    mov d19, v5.d[1]
+; CHECK-GI-FP16-NEXT:    mov d23, v20.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h1, d4
+; CHECK-GI-FP16-NEXT:    fcvt h4, d20
+; CHECK-GI-FP16-NEXT:    mov d26, v16.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h20, d5
+; CHECK-GI-FP16-NEXT:    fcvt h5, d16
+; CHECK-GI-FP16-NEXT:    fcvt h28, d21
+; CHECK-GI-FP16-NEXT:    fcvt h29, d22
+; CHECK-GI-FP16-NEXT:    fcvt h22, d24
+; CHECK-GI-FP16-NEXT:    fcvt h21, d25
 ; CHECK-GI-FP16-NEXT:    fcvt h18, d18
-; CHECK-GI-FP16-NEXT:    mov d28, v3.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h3, d17
-; CHECK-GI-FP16-NEXT:    fcvt h20, d20
-; CHECK-GI-FP16-NEXT:    mov d6, v6.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h26, d26
-; CHECK-GI-FP16-NEXT:    fcvt h4, d4
-; CHECK-GI-FP16-NEXT:    fcvt h5, d5
-; CHECK-GI-FP16-NEXT:    fcvt h17, d24
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h24, d23
-; CHECK-GI-FP16-NEXT:    fcvt h22, d27
-; CHECK-GI-FP16-NEXT:    mov d23, v23.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h6, d6
-; CHECK-GI-FP16-NEXT:    mov v18.h[1], v26.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h26, d25
-; CHECK-GI-FP16-NEXT:    mov d25, v25.d[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v17.h[0]
-; CHECK-GI-FP16-NEXT:    mov d17, v19.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h19, d19
-; CHECK-GI-FP16-NEXT:    mov v20.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT:    mov d22, v21.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h21, d21
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v24.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h24, d28
+; CHECK-GI-FP16-NEXT:    fcvt h27, d23
+; CHECK-GI-FP16-NEXT:    mov d23, v24.d[1]
+; CHECK-GI-FP16-NEXT:    mov d24, v25.d[1]
+; CHECK-GI-FP16-NEXT:    ldp q25, q16, [sp, #32]
+; CHECK-GI-FP16-NEXT:    fcvt h26, d26
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v28.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v29.h[0]
+; CHECK-GI-FP16-NEXT:    scvtf v7.2d, v7.2d
+; CHECK-GI-FP16-NEXT:    mov v4.h[1], v27.h[0]
+; CHECK-GI-FP16-NEXT:    scvtf v25.2d, v25.2d
 ; CHECK-GI-FP16-NEXT:    fcvt h23, d23
-; CHECK-GI-FP16-NEXT:    mov v18.h[2], v26.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h25, d25
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[1], v26.h[0]
+; CHECK-GI-FP16-NEXT:    ldp q26, q27, [sp, #96]
+; CHECK-GI-FP16-NEXT:    fcvt h24, d24
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v17.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v20.h[0]
+; CHECK-GI-FP16-NEXT:    mov d20, v6.d[1]
+; CHECK-GI-FP16-NEXT:    mov d17, v2.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
+; CHECK-GI-FP16-NEXT:    scvtf v26.2d, v26.2d
+; CHECK-GI-FP16-NEXT:    mov v4.h[2], v22.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h22, d25
+; CHECK-GI-FP16-NEXT:    mov v5.h[2], v21.h[0]
+; CHECK-GI-FP16-NEXT:    mov d21, v25.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h6, d6
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v18.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v19.h[0]
+; CHECK-GI-FP16-NEXT:    scvtf v16.2d, v16.2d
+; CHECK-GI-FP16-NEXT:    scvtf v18.2d, v27.2d
+; CHECK-GI-FP16-NEXT:    fcvt h19, d20
 ; CHECK-GI-FP16-NEXT:    fcvt h17, d17
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v19.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v5.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h19, d22
-; CHECK-GI-FP16-NEXT:    mov v20.h[2], v21.h[0]
-; CHECK-GI-FP16-NEXT:    mov v7.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v16.h[3], v24.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v23.h[0]
-; CHECK-GI-FP16-NEXT:    mov v18.h[3], v25.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v17.h[0]
-; CHECK-GI-FP16-NEXT:    mov v20.h[3], v19.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT:    mov v0.d[1], v16.d[0]
-; CHECK-GI-FP16-NEXT:    mov v2.d[1], v18.d[0]
-; CHECK-GI-FP16-NEXT:    mov v3.d[1], v20.d[0]
+; CHECK-GI-FP16-NEXT:    mov d25, v26.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h26, d26
+; CHECK-GI-FP16-NEXT:    mov v4.h[3], v23.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[3], v24.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h20, d21
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT:    mov d23, v18.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h18, d18
+; CHECK-GI-FP16-NEXT:    fcvt h3, d3
+; CHECK-GI-FP16-NEXT:    fcvt h21, d25
+; CHECK-GI-FP16-NEXT:    mov v4.h[4], v22.h[0]
+; CHECK-GI-FP16-NEXT:    mov d22, v16.d[1]
+; CHECK-GI-FP16-NEXT:    mov v5.h[4], v26.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h16, d16
+; CHECK-GI-FP16-NEXT:    mov d6, v7.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h7, d7
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v17.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v19.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h19, d23
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
+; CHECK-GI-FP16-NEXT:    mov v4.h[5], v20.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h17, d22
+; CHECK-GI-FP16-NEXT:    mov v5.h[5], v21.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h6, d6
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v7.h[0]
+; CHECK-GI-FP16-NEXT:    mov v4.h[6], v16.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[6], v18.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[7], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[7], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v4.h[7], v17.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[7], v19.h[0]
+; CHECK-GI-FP16-NEXT:    mov v2.16b, v4.16b
+; CHECK-GI-FP16-NEXT:    mov v3.16b, v5.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = sitofp <32 x i64> %a to <32 x half>
@@ -4638,103 +4639,104 @@ define <32 x half> @utofp_v32i64_v32f16(<32 x i64> %a) {
 ;
 ; CHECK-GI-FP16-LABEL: utofp_v32i64_v32f16:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    ucvtf v16.2d, v2.2d
+; CHECK-GI-FP16-NEXT:    ldp q16, q18, [sp]
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.2d, v0.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v18.2d, v4.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v17.2d, v6.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v4.2d, v1.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v3.2d, v3.2d
-; CHECK-GI-FP16-NEXT:    ldp q1, q23, [sp]
+; CHECK-GI-FP16-NEXT:    ldp q17, q19, [sp, #64]
+; CHECK-GI-FP16-NEXT:    ucvtf v4.2d, v4.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v1.2d, v1.2d
 ; CHECK-GI-FP16-NEXT:    ucvtf v5.2d, v5.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v6.2d, v7.2d
-; CHECK-GI-FP16-NEXT:    mov d20, v16.d[1]
-; CHECK-GI-FP16-NEXT:    mov d19, v0.d[1]
-; CHECK-GI-FP16-NEXT:    mov d21, v18.d[1]
-; CHECK-GI-FP16-NEXT:    mov d22, v17.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h16, d16
-; CHECK-GI-FP16-NEXT:    ucvtf v2.2d, v1.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v6.2d, v6.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v20.2d, v16.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v24.2d, v18.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v2.2d, v2.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v16.2d, v17.2d
+; CHECK-GI-FP16-NEXT:    mov d21, v0.d[1]
+; CHECK-GI-FP16-NEXT:    ucvtf v25.2d, v19.2d
+; CHECK-GI-FP16-NEXT:    mov d22, v4.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
-; CHECK-GI-FP16-NEXT:    fcvt h1, d18
-; CHECK-GI-FP16-NEXT:    ldr q18, [sp, #32]
-; CHECK-GI-FP16-NEXT:    fcvt h7, d17
-; CHECK-GI-FP16-NEXT:    ldp q25, q17, [sp, #48]
-; CHECK-GI-FP16-NEXT:    fcvt h20, d20
-; CHECK-GI-FP16-NEXT:    fcvt h24, d19
-; CHECK-GI-FP16-NEXT:    fcvt h21, d21
-; CHECK-GI-FP16-NEXT:    fcvt h22, d22
-; CHECK-GI-FP16-NEXT:    ucvtf v18.2d, v18.2d
-; CHECK-GI-FP16-NEXT:    fcvt h26, d4
-; CHECK-GI-FP16-NEXT:    ucvtf v17.2d, v17.2d
-; CHECK-GI-FP16-NEXT:    fcvt h27, d3
-; CHECK-GI-FP16-NEXT:    fcvt h28, d6
-; CHECK-GI-FP16-NEXT:    ucvtf v23.2d, v23.2d
-; CHECK-GI-FP16-NEXT:    ucvtf v25.2d, v25.2d
-; CHECK-GI-FP16-NEXT:    mov d4, v4.d[1]
-; CHECK-GI-FP16-NEXT:    mov v16.h[1], v20.h[0]
-; CHECK-GI-FP16-NEXT:    ldp q19, q20, [sp, #80]
-; CHECK-GI-FP16-NEXT:    mov v0.h[1], v24.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h24, d5
-; CHECK-GI-FP16-NEXT:    mov v1.h[1], v21.h[0]
-; CHECK-GI-FP16-NEXT:    ldr q21, [sp, #112]
-; CHECK-GI-FP16-NEXT:    mov v7.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT:    mov d22, v2.d[1]
-; CHECK-GI-FP16-NEXT:    ucvtf v20.2d, v20.2d
-; CHECK-GI-FP16-NEXT:    fcvt h2, d2
-; CHECK-GI-FP16-NEXT:    ucvtf v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT:    mov v16.h[2], v27.h[0]
-; CHECK-GI-FP16-NEXT:    ucvtf v21.2d, v21.2d
-; CHECK-GI-FP16-NEXT:    mov d5, v5.d[1]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v26.h[0]
-; CHECK-GI-FP16-NEXT:    mov d26, v18.d[1]
-; CHECK-GI-FP16-NEXT:    mov v1.h[2], v24.h[0]
-; CHECK-GI-FP16-NEXT:    mov d24, v17.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h22, d22
-; CHECK-GI-FP16-NEXT:    mov v7.h[2], v28.h[0]
-; CHECK-GI-FP16-NEXT:    mov d27, v20.d[1]
+; CHECK-GI-FP16-NEXT:    ucvtf v3.2d, v3.2d
+; CHECK-GI-FP16-NEXT:    mov d18, v1.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h17, d1
+; CHECK-GI-FP16-NEXT:    mov d19, v5.d[1]
+; CHECK-GI-FP16-NEXT:    mov d23, v20.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h1, d4
+; CHECK-GI-FP16-NEXT:    fcvt h4, d20
+; CHECK-GI-FP16-NEXT:    mov d26, v16.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h20, d5
+; CHECK-GI-FP16-NEXT:    fcvt h5, d16
+; CHECK-GI-FP16-NEXT:    fcvt h28, d21
+; CHECK-GI-FP16-NEXT:    fcvt h29, d22
+; CHECK-GI-FP16-NEXT:    fcvt h22, d24
+; CHECK-GI-FP16-NEXT:    fcvt h21, d25
 ; CHECK-GI-FP16-NEXT:    fcvt h18, d18
-; CHECK-GI-FP16-NEXT:    mov d28, v3.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h3, d17
-; CHECK-GI-FP16-NEXT:    fcvt h20, d20
-; CHECK-GI-FP16-NEXT:    mov d6, v6.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h26, d26
-; CHECK-GI-FP16-NEXT:    fcvt h4, d4
-; CHECK-GI-FP16-NEXT:    fcvt h5, d5
-; CHECK-GI-FP16-NEXT:    fcvt h17, d24
-; CHECK-GI-FP16-NEXT:    mov v2.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h24, d23
-; CHECK-GI-FP16-NEXT:    fcvt h22, d27
-; CHECK-GI-FP16-NEXT:    mov d23, v23.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h6, d6
-; CHECK-GI-FP16-NEXT:    mov v18.h[1], v26.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h26, d25
-; CHECK-GI-FP16-NEXT:    mov d25, v25.d[1]
-; CHECK-GI-FP16-NEXT:    mov v3.h[1], v17.h[0]
-; CHECK-GI-FP16-NEXT:    mov d17, v19.d[1]
 ; CHECK-GI-FP16-NEXT:    fcvt h19, d19
-; CHECK-GI-FP16-NEXT:    mov v20.h[1], v22.h[0]
-; CHECK-GI-FP16-NEXT:    mov d22, v21.d[1]
-; CHECK-GI-FP16-NEXT:    fcvt h21, d21
-; CHECK-GI-FP16-NEXT:    mov v2.h[2], v24.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h24, d28
+; CHECK-GI-FP16-NEXT:    fcvt h27, d23
+; CHECK-GI-FP16-NEXT:    mov d23, v24.d[1]
+; CHECK-GI-FP16-NEXT:    mov d24, v25.d[1]
+; CHECK-GI-FP16-NEXT:    ldp q25, q16, [sp, #32]
+; CHECK-GI-FP16-NEXT:    fcvt h26, d26
+; CHECK-GI-FP16-NEXT:    mov v0.h[1], v28.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[1], v29.h[0]
+; CHECK-GI-FP16-NEXT:    ucvtf v7.2d, v7.2d
+; CHECK-GI-FP16-NEXT:    mov v4.h[1], v27.h[0]
+; CHECK-GI-FP16-NEXT:    ucvtf v25.2d, v25.2d
 ; CHECK-GI-FP16-NEXT:    fcvt h23, d23
-; CHECK-GI-FP16-NEXT:    mov v18.h[2], v26.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h25, d25
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[1], v26.h[0]
+; CHECK-GI-FP16-NEXT:    ldp q26, q27, [sp, #96]
+; CHECK-GI-FP16-NEXT:    fcvt h24, d24
+; CHECK-GI-FP16-NEXT:    mov v0.h[2], v17.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[2], v20.h[0]
+; CHECK-GI-FP16-NEXT:    mov d20, v6.d[1]
+; CHECK-GI-FP16-NEXT:    mov d17, v2.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
+; CHECK-GI-FP16-NEXT:    ucvtf v26.2d, v26.2d
+; CHECK-GI-FP16-NEXT:    mov v4.h[2], v22.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h22, d25
+; CHECK-GI-FP16-NEXT:    mov v5.h[2], v21.h[0]
+; CHECK-GI-FP16-NEXT:    mov d21, v25.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h6, d6
+; CHECK-GI-FP16-NEXT:    mov v0.h[3], v18.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[3], v19.h[0]
+; CHECK-GI-FP16-NEXT:    ucvtf v16.2d, v16.2d
+; CHECK-GI-FP16-NEXT:    ucvtf v18.2d, v27.2d
+; CHECK-GI-FP16-NEXT:    fcvt h19, d20
 ; CHECK-GI-FP16-NEXT:    fcvt h17, d17
-; CHECK-GI-FP16-NEXT:    mov v3.h[2], v19.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.h[3], v5.h[0]
-; CHECK-GI-FP16-NEXT:    fcvt h19, d22
-; CHECK-GI-FP16-NEXT:    mov v20.h[2], v21.h[0]
-; CHECK-GI-FP16-NEXT:    mov v7.h[3], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v16.h[3], v24.h[0]
-; CHECK-GI-FP16-NEXT:    mov v2.h[3], v23.h[0]
-; CHECK-GI-FP16-NEXT:    mov v18.h[3], v25.h[0]
-; CHECK-GI-FP16-NEXT:    mov v3.h[3], v17.h[0]
-; CHECK-GI-FP16-NEXT:    mov v20.h[3], v19.h[0]
-; CHECK-GI-FP16-NEXT:    mov v1.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT:    mov v0.d[1], v16.d[0]
-; CHECK-GI-FP16-NEXT:    mov v2.d[1], v18.d[0]
-; CHECK-GI-FP16-NEXT:    mov v3.d[1], v20.d[0]
+; CHECK-GI-FP16-NEXT:    mov d25, v26.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h26, d26
+; CHECK-GI-FP16-NEXT:    mov v4.h[3], v23.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[3], v24.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h20, d21
+; CHECK-GI-FP16-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[4], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov d2, v3.d[1]
+; CHECK-GI-FP16-NEXT:    mov d23, v18.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h18, d18
+; CHECK-GI-FP16-NEXT:    fcvt h3, d3
+; CHECK-GI-FP16-NEXT:    fcvt h21, d25
+; CHECK-GI-FP16-NEXT:    mov v4.h[4], v22.h[0]
+; CHECK-GI-FP16-NEXT:    mov d22, v16.d[1]
+; CHECK-GI-FP16-NEXT:    mov v5.h[4], v26.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h16, d16
+; CHECK-GI-FP16-NEXT:    mov d6, v7.d[1]
+; CHECK-GI-FP16-NEXT:    fcvt h7, d7
+; CHECK-GI-FP16-NEXT:    mov v0.h[5], v17.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[5], v19.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h19, d23
+; CHECK-GI-FP16-NEXT:    fcvt h2, d2
+; CHECK-GI-FP16-NEXT:    mov v4.h[5], v20.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h17, d22
+; CHECK-GI-FP16-NEXT:    mov v5.h[5], v21.h[0]
+; CHECK-GI-FP16-NEXT:    fcvt h6, d6
+; CHECK-GI-FP16-NEXT:    mov v0.h[6], v3.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[6], v7.h[0]
+; CHECK-GI-FP16-NEXT:    mov v4.h[6], v16.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[6], v18.h[0]
+; CHECK-GI-FP16-NEXT:    mov v0.h[7], v2.h[0]
+; CHECK-GI-FP16-NEXT:    mov v1.h[7], v6.h[0]
+; CHECK-GI-FP16-NEXT:    mov v4.h[7], v17.h[0]
+; CHECK-GI-FP16-NEXT:    mov v5.h[7], v19.h[0]
+; CHECK-GI-FP16-NEXT:    mov v2.16b, v4.16b
+; CHECK-GI-FP16-NEXT:    mov v3.16b, v5.16b
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
   %c = uitofp <32 x i64> %a to <32 x half>
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index f17b9724aadba..01620652301ed 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1137,11 +1137,8 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) {
 ;
 ; CHECK-GI-LABEL: vselect_constant_cond_zero_v4i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI85_1
-; CHECK-GI-NEXT:    adrp x9, .LCPI85_0
-; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI85_1]
-; CHECK-GI-NEXT:    ldr d2, [x9, :lo12:.LCPI85_0]
-; CHECK-GI-NEXT:    mov v1.d[1], v2.d[0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI85_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI85_0]
 ; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #31
 ; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #31
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
@@ -1204,11 +1201,8 @@ define <4 x i32> @vselect_constant_cond_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; CHECK-GI-LABEL: vselect_constant_cond_v4i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI88_1
-; CHECK-GI-NEXT:    adrp x9, .LCPI88_0
-; CHECK-GI-NEXT:    ldr d2, [x8, :lo12:.LCPI88_1]
-; CHECK-GI-NEXT:    ldr d3, [x9, :lo12:.LCPI88_0]
-; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT:    adrp x8, .LCPI88_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI88_0]
 ; CHECK-GI-NEXT:    shl v2.4s, v2.4s, #31
 ; CHECK-GI-NEXT:    sshr v2.4s, v2.4s, #31
 ; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v2.16b
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 1531154b8a03c..86dd1bdd511eb 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2092,104 +2092,104 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-DOT-LABEL: test_udot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
 ; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #8]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #1]
+; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
 ; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
+; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
+; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
+; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
+; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
+; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
+; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
 ; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #17]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
 ; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #2]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #10]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #18]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
+; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
+; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
+; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
+; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
 ; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
+; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
+; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
 ; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #12]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
+; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
+; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
+; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
+; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
+; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
 ; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
+; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
+; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
 ; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #14]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
+; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
+; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
+; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
+; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
+; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
 ; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
+; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
+; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
 ; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v19.b[0]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-GI-DOT-NEXT:    mov v1.d[1], v3.d[0]
-; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT:    mov v5.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT:    udot v3.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
+; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
+; CHECK-GI-DOT-NEXT:    mov v3.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT:    mov v4.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
+; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
+; CHECK-GI-DOT-NEXT:    udot v0.4s, v4.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    udot v5.4s, v2.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -2670,104 +2670,104 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
 ; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
 ; CHECK-GI-DOT-NEXT:    ldr b1, [x0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #1]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #1]
 ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #8]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #9]
-; CHECK-GI-DOT-NEXT:    ldr b2, [x0, #16]
-; CHECK-GI-DOT-NEXT:    ldr b4, [x1]
-; CHECK-GI-DOT-NEXT:    mov v1.b[1], v5.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #1]
+; CHECK-GI-DOT-NEXT:    ldr b2, [x1]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #1]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #8]
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #2]
 ; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #8]
+; CHECK-GI-DOT-NEXT:    mov v2.b[1], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #2]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #17]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #17]
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #3]
+; CHECK-GI-DOT-NEXT:    mov v2.b[2], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #3]
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #4]
+; CHECK-GI-DOT-NEXT:    mov v2.b[3], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #4]
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #5]
+; CHECK-GI-DOT-NEXT:    mov v2.b[4], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #5]
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #6]
+; CHECK-GI-DOT-NEXT:    mov v2.b[5], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #6]
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #7]
+; CHECK-GI-DOT-NEXT:    mov v2.b[6], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #7]
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], v3.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b3, [x0, #16]
+; CHECK-GI-DOT-NEXT:    mov v2.b[7], v4.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b4, [x1, #16]
 ; CHECK-GI-DOT-NEXT:    mov v3.b[1], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #9]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x0, #17]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #18]
 ; CHECK-GI-DOT-NEXT:    mov v4.b[1], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b5, [x1, #16]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x1, #17]
-; CHECK-GI-DOT-NEXT:    mov v6.b[1], v17.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #2]
-; CHECK-GI-DOT-NEXT:    mov v2.b[1], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #10]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #2]
-; CHECK-GI-DOT-NEXT:    mov v5.b[1], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #10]
-; CHECK-GI-DOT-NEXT:    mov v1.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #18]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #18]
-; CHECK-GI-DOT-NEXT:    mov v3.b[2], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[2], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[2], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #3]
-; CHECK-GI-DOT-NEXT:    mov v2.b[2], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #3]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #11]
-; CHECK-GI-DOT-NEXT:    mov v5.b[2], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #11]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #19]
-; CHECK-GI-DOT-NEXT:    mov v1.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #18]
+; CHECK-GI-DOT-NEXT:    mov v1.b[8], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #9]
+; CHECK-GI-DOT-NEXT:    mov v2.b[8], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #9]
+; CHECK-GI-DOT-NEXT:    mov v3.b[2], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #19]
+; CHECK-GI-DOT-NEXT:    mov v4.b[2], v16.b[0]
 ; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #19]
+; CHECK-GI-DOT-NEXT:    mov v1.b[9], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #10]
+; CHECK-GI-DOT-NEXT:    mov v2.b[9], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #10]
 ; CHECK-GI-DOT-NEXT:    mov v3.b[3], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[3], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[3], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #4]
-; CHECK-GI-DOT-NEXT:    mov v2.b[3], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #4]
-; CHECK-GI-DOT-NEXT:    mov v5.b[3], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #12]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #12]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #20]
-; CHECK-GI-DOT-NEXT:    mov v1.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #20]
-; CHECK-GI-DOT-NEXT:    mov v3.b[4], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[4], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[4], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #5]
-; CHECK-GI-DOT-NEXT:    mov v2.b[4], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #5]
-; CHECK-GI-DOT-NEXT:    mov v5.b[4], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #13]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #13]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #21]
-; CHECK-GI-DOT-NEXT:    mov v1.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #20]
+; CHECK-GI-DOT-NEXT:    mov v4.b[3], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #20]
+; CHECK-GI-DOT-NEXT:    mov v1.b[10], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #11]
+; CHECK-GI-DOT-NEXT:    mov v2.b[10], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #11]
+; CHECK-GI-DOT-NEXT:    mov v3.b[4], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #21]
+; CHECK-GI-DOT-NEXT:    mov v4.b[4], v16.b[0]
 ; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #21]
+; CHECK-GI-DOT-NEXT:    mov v1.b[11], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #12]
+; CHECK-GI-DOT-NEXT:    mov v2.b[11], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #12]
 ; CHECK-GI-DOT-NEXT:    mov v3.b[5], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[5], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[5], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #6]
-; CHECK-GI-DOT-NEXT:    mov v2.b[5], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #6]
-; CHECK-GI-DOT-NEXT:    mov v5.b[5], v16.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #14]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #14]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #22]
-; CHECK-GI-DOT-NEXT:    mov v1.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x1, #22]
-; CHECK-GI-DOT-NEXT:    mov v3.b[6], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[6], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[6], v18.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b16, [x0, #7]
-; CHECK-GI-DOT-NEXT:    mov v2.b[6], v19.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b17, [x1, #7]
-; CHECK-GI-DOT-NEXT:    mov v5.b[6], v7.b[0]
-; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #15]
-; CHECK-GI-DOT-NEXT:    ldr b18, [x1, #15]
-; CHECK-GI-DOT-NEXT:    ldr b19, [x0, #23]
-; CHECK-GI-DOT-NEXT:    mov v1.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #22]
+; CHECK-GI-DOT-NEXT:    mov v4.b[5], v16.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #22]
+; CHECK-GI-DOT-NEXT:    mov v1.b[12], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #13]
+; CHECK-GI-DOT-NEXT:    mov v2.b[12], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #13]
+; CHECK-GI-DOT-NEXT:    mov v3.b[6], v7.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b7, [x0, #23]
+; CHECK-GI-DOT-NEXT:    mov v4.b[6], v16.b[0]
 ; CHECK-GI-DOT-NEXT:    ldr b16, [x1, #23]
+; CHECK-GI-DOT-NEXT:    mov v1.b[13], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #14]
+; CHECK-GI-DOT-NEXT:    mov v2.b[13], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #14]
 ; CHECK-GI-DOT-NEXT:    mov v3.b[7], v7.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.b[7], v17.b[0]
-; CHECK-GI-DOT-NEXT:    mov v6.b[7], v18.b[0]
-; CHECK-GI-DOT-NEXT:    mov v2.b[7], v19.b[0]
-; CHECK-GI-DOT-NEXT:    mov v5.b[7], v16.b[0]
-; CHECK-GI-DOT-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-GI-DOT-NEXT:    mov v1.d[1], v3.d[0]
-; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    mov v2.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT:    mov v5.d[1], v0.d[0]
-; CHECK-GI-DOT-NEXT:    sdot v3.4s, v4.16b, v1.16b
-; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-GI-DOT-NEXT:    mov v4.b[7], v16.b[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[14], v5.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b5, [x0, #15]
+; CHECK-GI-DOT-NEXT:    mov v2.b[14], v6.b[0]
+; CHECK-GI-DOT-NEXT:    ldr b6, [x1, #15]
+; CHECK-GI-DOT-NEXT:    mov v3.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT:    mov v4.d[1], v0.d[0]
+; CHECK-GI-DOT-NEXT:    mov v1.b[15], v5.b[0]
+; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v2.b[15], v6.b[0]
+; CHECK-GI-DOT-NEXT:    sdot v0.4s, v4.16b, v3.16b
+; CHECK-GI-DOT-NEXT:    sdot v5.4s, v2.16b, v1.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret

>From b3cb99b9456b4c771e12ac0452500d0bf96ae568 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Wed, 7 Feb 2024 10:21:28 +0000
Subject: [PATCH 2/2] fixup! [AArch64][GloablISel] Refactor Combine
 G_CONCAT_VECTOR

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  | 16 +++++------
 .../include/llvm/Target/GlobalISel/Combine.td |  4 +--
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 27 ++++++++++---------
 llvm/lib/Target/AArch64/AArch64Combine.td     |  2 +-
 4 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 133b5d2c4e6b7..6a805ee40a7d8 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -227,19 +227,15 @@ class CombinerHelper {
   /// ==========================================================
   /// Check if the G_CONCAT_VECTORS \p MI is undef or if it
   /// can be flattened into a build_vector.
-  /// In the first case \p bool will be true.
-  /// In the second case \p SmallVector<Register> will contain the operands
+  /// In the first case \p Ops will be empty.
+  /// In the second case \p Ops will contain the operands
   /// needed to produce the flattened build_vector.
   ///
   /// \pre MI.getOpcode() == G_CONCAT_VECTORS.
-  bool
-  matchCombineConcatVectors(MachineInstr &MI,
-                            std::pair<bool, SmallVector<Register>> &matchinfo);
-  /// Replace \p MI with a flattened build_vector with \p SmallVector<Register>
-  /// or an implicit_def if \p bool is true.
-  void
-  applyCombineConcatVectors(MachineInstr &MI,
-                            std::pair<bool, SmallVector<Register>> &matchinfo);
+  bool matchCombineConcatVectors(MachineInstr &MI, SmallVector<Register> &Ops);
+  /// Replace \p MI with a flattened build_vector built from \p Ops,
+  /// or with an implicit_def if \p Ops is empty.
+  void applyCombineConcatVectors(MachineInstr &MI, SmallVector<Register> &Ops);
 
   /// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS.
   /// Returns true if MI changed.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 77a6faaf837d5..7eadb718f1641 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1254,7 +1254,7 @@ def match_ors : GICombineRule<
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
 // Combines concat operations
-def concat_matchinfo : GIDefMatchData<"std::pair<bool, SmallVector<Register>>">;
+def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
 def combine_concat_vector : GICombineRule<
   (defs root:$root, concat_matchinfo:$matchinfo),
   (match (wip_match_opcode G_CONCAT_VECTORS):$root,
@@ -1342,4 +1342,4 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
 // compile time performance.
 def optnone_combines : GICombineGroup<[trivial_combines,
     ptr_add_immed_chain, combines_for_extload,
-    not_cmp_fold, opt_brcond_by_inverting_cond]>;
+    not_cmp_fold, opt_brcond_by_inverting_cond, combine_concat_vector]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 71b383b54a141..b400eb34e2901 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -222,11 +222,11 @@ void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
   replaceRegWith(MRI, DstReg, SrcReg);
 }
 
-bool CombinerHelper::matchCombineConcatVectors(
-    MachineInstr &MI, std::pair<bool, SmallVector<Register>> &matchinfo) {
+bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI,
+                                               SmallVector<Register> &Ops) {
   assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
          "Invalid instruction");
-  matchinfo.first = true;
+  bool IsUndef = true;
   MachineInstr *Undef = nullptr;
 
   // Walk over all the operands of concat vectors and check if they are
@@ -240,11 +240,11 @@ bool CombinerHelper::matchCombineConcatVectors(
       return false;
     switch (Def->getOpcode()) {
     case TargetOpcode::G_BUILD_VECTOR:
-      matchinfo.first = false;
+      IsUndef = false;
       // Remember the operands of the build_vector to fold
       // them into the yet-to-build flattened concat vectors.
       for (const MachineOperand &BuildVecMO : Def->uses())
-        matchinfo.second.push_back(BuildVecMO.getReg());
+        Ops.push_back(BuildVecMO.getReg());
       break;
     case TargetOpcode::G_IMPLICIT_DEF: {
       LLT OpType = MRI.getType(Reg);
@@ -260,7 +260,7 @@ bool CombinerHelper::matchCombineConcatVectors(
       // for the flattening.
       for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
            EltIdx != EltEnd; ++EltIdx)
-        matchinfo.second.push_back(Undef->getOperand(0).getReg());
+        Ops.push_back(Undef->getOperand(0).getReg());
       break;
     }
     default:
@@ -270,15 +270,18 @@ bool CombinerHelper::matchCombineConcatVectors(
 
   // Check if the combine is illegal
   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
-  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_BUILD_VECTOR,
-                                 {DstTy, MRI.getType(matchinfo.second[0])}})) {
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_BUILD_VECTOR, {DstTy, MRI.getType(Ops[0])}})) {
     return false;
   }
 
+  if (IsUndef)
+    Ops.clear();
+
   return true;
 }
-void CombinerHelper::applyCombineConcatVectors(
-    MachineInstr &MI, std::pair<bool, SmallVector<Register>> &matchinfo) {
+void CombinerHelper::applyCombineConcatVectors(MachineInstr &MI,
+                                               SmallVector<Register> &Ops) {
   // We determined that the concat_vectors can be flattened.
   // Generate the flattened build_vector.
   Register DstReg = MI.getOperand(0).getReg();
@@ -291,10 +294,10 @@ void CombinerHelper::applyCombineConcatVectors(
   // clean that up.  For now, given we already gather this information
   // in matchCombineConcatVectors, just save compile time and issue the
   // right thing.
-  if (matchinfo.first)
+  if (Ops.empty())
     Builder.buildUndef(NewDstReg);
   else
-    Builder.buildBuildVector(NewDstReg, matchinfo.second);
+    Builder.buildBuildVector(NewDstReg, Ops);
   MI.eraseFromParent();
   replaceRegWith(MRI, DstReg, NewDstReg);
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index e4d8359c71e62..b839e597d7c46 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -64,7 +64,7 @@ def AArch64PreLegalizerCombiner: GICombiner<
 }
 
 def AArch64O0PreLegalizerCombiner: GICombiner<
-  "AArch64O0PreLegalizerCombinerImpl", [optnone_combines, combine_concat_vector]> {
+  "AArch64O0PreLegalizerCombinerImpl", [optnone_combines]> {
   let CombineAllMethodName = "tryCombineAllImpl";
 }
 



More information about the llvm-commits mailing list