[llvm] [GlobalIsel] Push cast through build vector (PR #104634)

Thorsten Schütt via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 16 12:26:13 PDT 2024


https://github.com/tschuett updated https://github.com/llvm/llvm-project/pull/104634

>From 0f93b8e2173645043466ffc08f98699f8d239a1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Fri, 16 Aug 2024 20:57:01 +0200
Subject: [PATCH 1/2] [GlobalIsel] Push cast through build vector

Credits: https://github.com/llvm/llvm-project/pull/100563
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   3 +
 .../include/llvm/Target/GlobalISel/Combine.td |  17 +-
 .../GlobalISel/CombinerHelperCasts.cpp        |  39 +
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   1 +
 .../AArch64/GlobalISel/combine-cast.mir       |  92 +++
 .../GlobalISel/combine-extract-vec-elt.mir    |   4 +-
 .../AArch64/GlobalISel/combine-with-flags.mir |  45 +-
 .../CodeGen/AArch64/arm64-subvector-extend.ll | 456 +++++++-----
 llvm/test/CodeGen/AArch64/arm64-vadd.ll       |  46 +-
 llvm/test/CodeGen/AArch64/neon-extadd.ll      | 376 ++++++----
 llvm/test/CodeGen/AArch64/sext.ll             | 354 ++++++----
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    | 664 +++++++++++-------
 llvm/test/CodeGen/AArch64/xtn.ll              |  46 +-
 llvm/test/CodeGen/AArch64/zext.ll             | 263 ++++---
 ...mbine-shl-from-extend-narrow.postlegal.mir |  12 +-
 ...ombine-shl-from-extend-narrow.prelegal.mir |  10 +-
 16 files changed, 1491 insertions(+), 937 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 32effc536eb35d..9b62d6067be39c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -900,6 +900,9 @@ class CombinerHelper {
   bool matchExtOfExt(const MachineInstr &FirstMI, const MachineInstr &SecondMI,
                      BuildFnTy &MatchInfo);
 
+  bool matchCastOfBuildVector(const MachineInstr &CastMI,
+                              const MachineInstr &BVMI, BuildFnTy &MatchInfo);
+
 private:
   /// Checks for legality of an indexed variant of \p LdSt.
   bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 814c5e789cb374..c95f542757c66b 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1857,6 +1857,18 @@ def anyext_of_anyext : ext_of_ext_opcodes<G_ANYEXT, G_ANYEXT>;
 def anyext_of_zext : ext_of_ext_opcodes<G_ANYEXT, G_ZEXT>;
 def anyext_of_sext : ext_of_ext_opcodes<G_ANYEXT, G_SEXT>;
 
+// Push cast through build vector.
+class buildvector_of_opcode<Instruction castOpcode> : GICombineRule <
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (G_BUILD_VECTOR $bv, GIVariadic<>:$unused):$Build,
+         (castOpcode $root, $bv):$Cast,
+         [{ return Helper.matchCastOfBuildVector(*${Cast}, *${Build}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${Cast}, ${matchinfo}); }])>;
+
+def buildvector_of_zext : buildvector_of_opcode<G_ZEXT>;
+def buildvector_of_anyext : buildvector_of_opcode<G_ANYEXT>;
+def buildvector_of_truncate : buildvector_of_opcode<G_TRUNC>;
+
 def cast_combines: GICombineGroup<[
   truncate_of_zext,
   truncate_of_sext,
@@ -1870,7 +1882,10 @@ def cast_combines: GICombineGroup<[
   sext_of_anyext,
   anyext_of_anyext,
   anyext_of_zext,
-  anyext_of_sext
+  anyext_of_sext,
+  buildvector_of_zext,
+  buildvector_of_anyext,
+  buildvector_of_truncate
 ]>;
 
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
index 494d8da84445d1..e3208c16cfd586 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp
@@ -273,3 +273,42 @@ bool CombinerHelper::matchExtOfExt(const MachineInstr &FirstMI,
 
   return false;
 }
+
+bool CombinerHelper::matchCastOfBuildVector(const MachineInstr &CastMI,
+                                            const MachineInstr &BVMI,
+                                            BuildFnTy &MatchInfo) {
+  const GExtOrTruncOp *Cast = cast<GExtOrTruncOp>(&CastMI);
+  const GBuildVector *BV = cast<GBuildVector>(&BVMI);
+
+  if (!MRI.hasOneNonDBGUse(BV->getReg(0)))
+    return false;
+
+  Register Dst = Cast->getReg(0);
+  // The type of the new build vector.
+  LLT DstTy = MRI.getType(Dst);
+  // The scalar or element type of the new build vector.
+  LLT ElemTy = DstTy.getScalarType();
+  // The scalar or element type of the old build vector.
+  LLT InputElemTy = MRI.getType(BV->getReg(0)).getScalarType();
+
+  // Check legality of new build vector, the scalar casts, and profitability of
+  // the many casts.
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_BUILD_VECTOR, {DstTy, ElemTy}}) ||
+      !isLegalOrBeforeLegalizer({Cast->getOpcode(), {ElemTy, InputElemTy}}) ||
+      !isCastFree(Cast->getOpcode(), ElemTy, InputElemTy))
+    return false;
+
+  MatchInfo = [=](MachineIRBuilder &B) {
+    SmallVector<Register> Casts;
+    unsigned Elements = BV->getNumSources();
+    for (unsigned I = 0; I < Elements; ++I)
+      Casts.push_back(
+          B.buildInstr(Cast->getOpcode(), {ElemTy}, {BV->getSourceReg(I)})
+              .getReg(0));
+
+    B.buildBuildVector(Dst, Casts);
+  };
+
+  return true;
+}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index d3c5742cee3eb4..33a1fa1ad04fdf 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -953,6 +953,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampNumElements(0, v2s64, v2s64)
       .minScalarOrElt(0, s8)
       .widenVectorEltsToVectorMinSize(0, 64)
+      .widenScalarOrEltToNextPow2(0)
       .minScalarSameAs(1, 0);
 
   getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC).lower();
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
index 0f436127ea2eb6..9eef79a9c4bbee 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
@@ -129,3 +129,95 @@ body:             |
     %res:_(<2 x s64>) = G_SELECT %cond(<2 x s32>), %bv, %bv2
     %small:_(<2 x s32>) = G_TRUNC %res(<2 x s64>)
     $x0 = COPY %small(<2 x s32>)
+...
+---
+name:            test_combine_trunc_build_vector
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-PRE-LABEL: name: test_combine_trunc_build_vector
+    ; CHECK-PRE: %arg1:_(s64) = COPY $x0
+    ; CHECK-PRE-NEXT: %arg2:_(s64) = COPY $x0
+    ; CHECK-PRE-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64)
+    ; CHECK-PRE-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC %arg2(s64)
+    ; CHECK-PRE-NEXT: %small:_(<2 x s32>) = G_BUILD_VECTOR [[TRUNC]](s32), [[TRUNC1]](s32)
+    ; CHECK-PRE-NEXT: $x0 = COPY %small(<2 x s32>)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_trunc_build_vector
+    ; CHECK-POST: %arg1:_(s64) = COPY $x0
+    ; CHECK-POST-NEXT: %arg2:_(s64) = COPY $x0
+    ; CHECK-POST-NEXT: %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+    ; CHECK-POST-NEXT: %small:_(<2 x s32>) = G_TRUNC %bv(<2 x s64>)
+    ; CHECK-POST-NEXT: $x0 = COPY %small(<2 x s32>)
+    %arg1:_(s64) = COPY $x0
+    %arg2:_(s64) = COPY $x0
+    %bv:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64)
+    %small:_(<2 x s32>) = G_TRUNC %bv(<2 x s64>)
+    $x0 = COPY %small(<2 x s32>)
+...
+---
+name:            test_combine_zext_build_vector
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-PRE-LABEL: name: test_combine_zext_build_vector
+    ; CHECK-PRE: %arg1:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %arg2:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %arg1(s32)
+    ; CHECK-PRE-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT %arg2(s32)
+    ; CHECK-PRE-NEXT: %large:_(<2 x s64>) = G_BUILD_VECTOR [[ZEXT]](s64), [[ZEXT1]](s64)
+    ; CHECK-PRE-NEXT: $q0 = COPY %large(<2 x s64>)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_zext_build_vector
+    ; CHECK-POST: %arg1:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %arg2:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+    ; CHECK-POST-NEXT: %large:_(<2 x s64>) = G_ZEXT %bv(<2 x s32>)
+    ; CHECK-POST-NEXT: $q0 = COPY %large(<2 x s64>)
+    %arg1:_(s32) = COPY $w0
+    %arg2:_(s32) = COPY $w0
+    %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+    %large:_(<2 x s64>) = G_ZEXT %bv(<2 x s32>)
+    $q0 = COPY %large(<2 x s64>)
+...
+---
+name:            test_combine_anyext_build_vector
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-PRE-LABEL: name: test_combine_anyext_build_vector
+    ; CHECK-PRE: %arg1:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: %arg2:_(s32) = COPY $w0
+    ; CHECK-PRE-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT %arg1(s32)
+    ; CHECK-PRE-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT %arg2(s32)
+    ; CHECK-PRE-NEXT: %large:_(<2 x s64>) = G_BUILD_VECTOR [[ANYEXT]](s64), [[ANYEXT1]](s64)
+    ; CHECK-PRE-NEXT: $q0 = COPY %large(<2 x s64>)
+    ;
+    ; CHECK-POST-LABEL: name: test_combine_anyext_build_vector
+    ; CHECK-POST: %arg1:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %arg2:_(s32) = COPY $w0
+    ; CHECK-POST-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+    ; CHECK-POST-NEXT: %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>)
+    ; CHECK-POST-NEXT: $q0 = COPY %large(<2 x s64>)
+    %arg1:_(s32) = COPY $w0
+    %arg2:_(s32) = COPY $w0
+    %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+    %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>)
+    $q0 = COPY %large(<2 x s64>)
+...
+---
+name:            test_combine_sext_build_vector
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_sext_build_vector
+    ; CHECK: %arg1:_(s32) = COPY $w0
+    ; CHECK-NEXT: %arg2:_(s32) = COPY $w0
+    ; CHECK-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+    ; CHECK-NEXT: %large:_(<2 x s64>) = G_SEXT %bv(<2 x s32>)
+    ; CHECK-NEXT: $q0 = COPY %large(<2 x s64>)
+    %arg1:_(s32) = COPY $w0
+    %arg2:_(s32) = COPY $w0
+    %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+    %large:_(<2 x s64>) = G_SEXT %bv(<2 x s32>)
+    $q0 = COPY %large(<2 x s64>)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
index 70241e71aa593f..c98dcf6ccb7966 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
@@ -49,8 +49,8 @@ body:             |
     ; CHECK: liveins: $x0, $x1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %arg1:_(s64) = COPY $x0
-    ; CHECK-NEXT: %extract:_(s32) = G_TRUNC %arg1(s64)
-    ; CHECK-NEXT: %zext:_(s64) = G_ZEXT %extract(s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %arg1(s64)
+    ; CHECK-NEXT: %zext:_(s64) = G_ZEXT [[TRUNC]](s32)
     ; CHECK-NEXT: $x0 = COPY %zext(s64)
     ; CHECK-NEXT: RET_ReallyLR implicit $x0
     %arg1:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
index 6eece5c56258dc..8cb44605246ffa 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-with-flags.mir
@@ -60,8 +60,11 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
-    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY]](s32), [[COPY1]](s32)
-    ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: %trunc:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; CHECK-NEXT: %zext:_(<4 x s32>) = G_ZEXT %trunc(<4 x s16>)
+    ; CHECK-NEXT: $q0 = COPY %zext(<4 x s32>)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0
     %0:_(s32) = COPY $w0
     %1:_(s32) = COPY $w1
@@ -165,8 +168,13 @@ body:             |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
-    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-    ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+    ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; CHECK-NEXT: %s:_(<4 x s32>) = G_SEXT %t(<4 x s16>)
+    ; CHECK-NEXT: $q0 = COPY %s(<4 x s32>)
     %0:_(s32) = COPY $w0
     %1:_(s32) = COPY $w1
     %2:_(s32) = COPY $w2
@@ -188,8 +196,11 @@ body:             |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
-    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-    ; CHECK-NEXT: %t:_(<4 x s16>) = G_TRUNC %bv0(<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+    ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
     ; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>)
     ; CHECK-NEXT: $q0 = COPY %z(<4 x s32>)
     %0:_(s32) = COPY $w0
@@ -213,8 +224,11 @@ body:             |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
-    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-    ; CHECK-NEXT: %t:_(<4 x s16>) = nsw G_TRUNC %bv0(<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+    ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
     ; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>)
     ; CHECK-NEXT: $q0 = COPY %z(<4 x s32>)
     %0:_(s32) = COPY $w0
@@ -238,8 +252,13 @@ body:             |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
-    ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
-    ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32)
+    ; CHECK-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY3]](s32)
+    ; CHECK-NEXT: %t:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; CHECK-NEXT: %z:_(<4 x s32>) = G_ZEXT %t(<4 x s16>)
+    ; CHECK-NEXT: $q0 = COPY %z(<4 x s32>)
     %0:_(s32) = COPY $w0
     %1:_(s32) = COPY $w1
     %2:_(s32) = COPY $w2
@@ -259,8 +278,10 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
-    ; CHECK-NEXT: %bv0:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64)
-    ; CHECK-NEXT: %z:_(<2 x s32>) = nuw G_TRUNC %bv0(<2 x s64>)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: %t:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; CHECK-NEXT: %z:_(<2 x s32>) = G_ZEXT %t(<2 x s16>)
     ; CHECK-NEXT: $d0 = COPY %z(<2 x s32>)
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index abf2e1272d6450..1f5654d59926dc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -466,62 +466,92 @@ define <32 x i8> @sext_v32i1(<32 x i1> %arg) {
 ;
 ; CHECK-GI-LABEL: sext_v32i1:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov s17, w0
-; CHECK-GI-NEXT:    fmov s19, w4
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    ldr s21, [sp, #8]
-; CHECK-GI-NEXT:    ldr s1, [sp, #32]
-; CHECK-GI-NEXT:    ldr s22, [sp, #40]
-; CHECK-GI-NEXT:    ldr s2, [sp, #64]
-; CHECK-GI-NEXT:    ldr s23, [sp, #72]
-; CHECK-GI-NEXT:    ldr s3, [sp, #96]
-; CHECK-GI-NEXT:    ldr s24, [sp, #104]
-; CHECK-GI-NEXT:    mov.s v17[1], w1
-; CHECK-GI-NEXT:    mov.s v19[1], w5
-; CHECK-GI-NEXT:    ldr s5, [sp, #128]
-; CHECK-GI-NEXT:    ldr s20, [sp, #136]
-; CHECK-GI-NEXT:    mov.s v0[1], v21[0]
-; CHECK-GI-NEXT:    ldr s7, [sp, #160]
-; CHECK-GI-NEXT:    ldr s25, [sp, #168]
-; CHECK-GI-NEXT:    mov.s v1[1], v22[0]
-; CHECK-GI-NEXT:    mov.s v2[1], v23[0]
-; CHECK-GI-NEXT:    mov.s v3[1], v24[0]
-; CHECK-GI-NEXT:    mov.s v5[1], v20[0]
-; CHECK-GI-NEXT:    mov.s v7[1], v25[0]
-; CHECK-GI-NEXT:    ldr s16, [sp, #16]
-; CHECK-GI-NEXT:    ldr s18, [sp, #48]
-; CHECK-GI-NEXT:    ldr s20, [sp, #80]
-; CHECK-GI-NEXT:    ldr s21, [sp, #112]
-; CHECK-GI-NEXT:    ldr s22, [sp, #144]
-; CHECK-GI-NEXT:    ldr s23, [sp, #176]
-; CHECK-GI-NEXT:    mov.s v17[2], w2
-; CHECK-GI-NEXT:    mov.s v19[2], w6
-; CHECK-GI-NEXT:    mov.s v0[2], v16[0]
-; CHECK-GI-NEXT:    mov.s v1[2], v18[0]
-; CHECK-GI-NEXT:    mov.s v2[2], v20[0]
-; CHECK-GI-NEXT:    mov.s v3[2], v21[0]
-; CHECK-GI-NEXT:    mov.s v5[2], v22[0]
-; CHECK-GI-NEXT:    mov.s v7[2], v23[0]
-; CHECK-GI-NEXT:    ldr s4, [sp, #24]
-; CHECK-GI-NEXT:    ldr s6, [sp, #56]
-; CHECK-GI-NEXT:    ldr s16, [sp, #88]
-; CHECK-GI-NEXT:    ldr s18, [sp, #120]
-; CHECK-GI-NEXT:    ldr s20, [sp, #152]
-; CHECK-GI-NEXT:    ldr s21, [sp, #184]
-; CHECK-GI-NEXT:    mov.s v17[3], w3
-; CHECK-GI-NEXT:    mov.s v19[3], w7
-; CHECK-GI-NEXT:    mov.s v0[3], v4[0]
-; CHECK-GI-NEXT:    mov.s v1[3], v6[0]
-; CHECK-GI-NEXT:    mov.s v2[3], v16[0]
-; CHECK-GI-NEXT:    mov.s v3[3], v18[0]
-; CHECK-GI-NEXT:    mov.s v5[3], v20[0]
-; CHECK-GI-NEXT:    mov.s v7[3], v21[0]
-; CHECK-GI-NEXT:    uzp1.8h v4, v17, v19
-; CHECK-GI-NEXT:    uzp1.8h v0, v0, v1
-; CHECK-GI-NEXT:    uzp1.8h v1, v2, v3
-; CHECK-GI-NEXT:    uzp1.8h v2, v5, v7
-; CHECK-GI-NEXT:    uzp1.16b v0, v4, v0
-; CHECK-GI-NEXT:    uzp1.16b v1, v1, v2
+; CHECK-GI-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-NEXT:    ldr w8, [sp, #72]
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s2, w1
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #80]
+; CHECK-GI-NEXT:    ldr w9, [sp, #128]
+; CHECK-GI-NEXT:    mov.b v0[1], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w2
+; CHECK-GI-NEXT:    mov.b v1[1], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #88]
+; CHECK-GI-NEXT:    mov.b v0[2], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w3
+; CHECK-GI-NEXT:    mov.b v1[2], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #96]
+; CHECK-GI-NEXT:    mov.b v0[3], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w4
+; CHECK-GI-NEXT:    mov.b v1[3], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #104]
+; CHECK-GI-NEXT:    mov.b v0[4], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w5
+; CHECK-GI-NEXT:    mov.b v1[4], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #112]
+; CHECK-GI-NEXT:    mov.b v0[5], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w6
+; CHECK-GI-NEXT:    mov.b v1[5], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #120]
+; CHECK-GI-NEXT:    mov.b v0[6], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w7
+; CHECK-GI-NEXT:    mov.b v1[6], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp]
+; CHECK-GI-NEXT:    mov.b v0[7], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #8]
+; CHECK-GI-NEXT:    mov.b v1[7], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #136]
+; CHECK-GI-NEXT:    mov.b v0[8], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-NEXT:    mov.b v1[8], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #144]
+; CHECK-GI-NEXT:    mov.b v0[9], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    mov.b v1[9], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #152]
+; CHECK-GI-NEXT:    mov.b v0[10], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-NEXT:    mov.b v1[10], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #160]
+; CHECK-GI-NEXT:    mov.b v0[11], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-NEXT:    mov.b v1[11], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #168]
+; CHECK-GI-NEXT:    mov.b v0[12], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #48]
+; CHECK-GI-NEXT:    mov.b v1[12], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #176]
+; CHECK-GI-NEXT:    mov.b v0[13], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-NEXT:    mov.b v1[13], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #184]
+; CHECK-GI-NEXT:    mov.b v0[14], v2[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    mov.b v1[14], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    mov.b v0[15], v2[0]
+; CHECK-GI-NEXT:    mov.b v1[15], v3[0]
 ; CHECK-GI-NEXT:    shl.16b v0, v0, #7
 ; CHECK-GI-NEXT:    shl.16b v1, v1, #7
 ; CHECK-GI-NEXT:    sshr.16b v0, v0, #7
@@ -807,140 +837,198 @@ define <64 x i8> @sext_v64i1(<64 x i1> %arg) {
 ;
 ; CHECK-GI-LABEL: sext_v64i1:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x29, [sp, #16] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-GI-NEXT:    .cfi_offset w29, -16
-; CHECK-GI-NEXT:    .cfi_offset b8, -24
-; CHECK-GI-NEXT:    .cfi_offset b9, -32
-; CHECK-GI-NEXT:    ldr s0, [sp, #32]
-; CHECK-GI-NEXT:    ldr s4, [sp, #40]
-; CHECK-GI-NEXT:    ldr s2, [sp, #96]
-; CHECK-GI-NEXT:    ldr s5, [sp, #104]
-; CHECK-GI-NEXT:    ldr s1, [sp, #64]
-; CHECK-GI-NEXT:    ldr s23, [sp, #72]
-; CHECK-GI-NEXT:    mov.s v0[1], v4[0]
-; CHECK-GI-NEXT:    ldr s28, [sp, #200]
-; CHECK-GI-NEXT:    ldr s3, [sp, #128]
-; CHECK-GI-NEXT:    mov.s v2[1], v5[0]
-; CHECK-GI-NEXT:    mov.s v1[1], v23[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #192]
-; CHECK-GI-NEXT:    ldr s7, [sp, #136]
-; CHECK-GI-NEXT:    ldr s4, [sp, #160]
-; CHECK-GI-NEXT:    ldr s24, [sp, #168]
-; CHECK-GI-NEXT:    mov.s v5[1], v28[0]
-; CHECK-GI-NEXT:    ldr s6, [sp, #48]
-; CHECK-GI-NEXT:    ldr s21, [sp, #80]
-; CHECK-GI-NEXT:    mov.s v3[1], v7[0]
-; CHECK-GI-NEXT:    mov.s v4[1], v24[0]
-; CHECK-GI-NEXT:    ldr s16, [sp, #112]
-; CHECK-GI-NEXT:    ldr s29, [sp, #208]
-; CHECK-GI-NEXT:    mov.s v0[2], v6[0]
-; CHECK-GI-NEXT:    mov.s v1[2], v21[0]
-; CHECK-GI-NEXT:    ldr s6, [sp, #224]
-; CHECK-GI-NEXT:    ldr s30, [sp, #232]
-; CHECK-GI-NEXT:    mov.s v2[2], v16[0]
-; CHECK-GI-NEXT:    ldr s20, [sp, #144]
-; CHECK-GI-NEXT:    ldr s27, [sp, #176]
-; CHECK-GI-NEXT:    mov.s v5[2], v29[0]
-; CHECK-GI-NEXT:    mov.s v6[1], v30[0]
-; CHECK-GI-NEXT:    ldr s18, [sp, #88]
-; CHECK-GI-NEXT:    ldr s19, [sp, #120]
-; CHECK-GI-NEXT:    ldr s7, [sp, #256]
-; CHECK-GI-NEXT:    ldr s31, [sp, #264]
-; CHECK-GI-NEXT:    mov.s v3[2], v20[0]
-; CHECK-GI-NEXT:    mov.s v4[2], v27[0]
-; CHECK-GI-NEXT:    ldr s25, [sp, #216]
-; CHECK-GI-NEXT:    ldr s26, [sp, #240]
-; CHECK-GI-NEXT:    ldr s17, [sp, #56]
-; CHECK-GI-NEXT:    ldr s22, [sp, #152]
-; CHECK-GI-NEXT:    mov.s v1[3], v18[0]
-; CHECK-GI-NEXT:    ldr s23, [sp, #184]
-; CHECK-GI-NEXT:    mov.s v2[3], v19[0]
-; CHECK-GI-NEXT:    ldr s18, [sp, #320]
-; CHECK-GI-NEXT:    ldr s27, [sp, #328]
-; CHECK-GI-NEXT:    mov.s v7[1], v31[0]
-; CHECK-GI-NEXT:    ldr s19, [sp, #352]
-; CHECK-GI-NEXT:    ldr s29, [sp, #360]
-; CHECK-GI-NEXT:    mov.s v5[3], v25[0]
-; CHECK-GI-NEXT:    mov.s v6[2], v26[0]
-; CHECK-GI-NEXT:    fmov s25, w0
-; CHECK-GI-NEXT:    fmov s26, w4
-; CHECK-GI-NEXT:    ldr s28, [sp, #272]
-; CHECK-GI-NEXT:    mov.s v0[3], v17[0]
-; CHECK-GI-NEXT:    ldr s17, [sp, #288]
-; CHECK-GI-NEXT:    ldr s8, [sp, #296]
-; CHECK-GI-NEXT:    mov.s v3[3], v22[0]
-; CHECK-GI-NEXT:    ldr s20, [sp, #384]
-; CHECK-GI-NEXT:    mov.s v4[3], v23[0]
-; CHECK-GI-NEXT:    ldr s30, [sp, #392]
-; CHECK-GI-NEXT:    ldr s22, [sp, #416]
-; CHECK-GI-NEXT:    ldr s31, [sp, #424]
-; CHECK-GI-NEXT:    ldr s23, [sp, #448]
-; CHECK-GI-NEXT:    mov.s v18[1], v27[0]
-; CHECK-GI-NEXT:    mov.s v19[1], v29[0]
-; CHECK-GI-NEXT:    ldr s27, [sp, #456]
-; CHECK-GI-NEXT:    ldr s24, [sp, #336]
-; CHECK-GI-NEXT:    mov.s v17[1], v8[0]
-; CHECK-GI-NEXT:    mov.s v7[2], v28[0]
-; CHECK-GI-NEXT:    mov.s v25[1], w1
-; CHECK-GI-NEXT:    mov.s v26[1], w5
-; CHECK-GI-NEXT:    mov.s v20[1], v30[0]
-; CHECK-GI-NEXT:    ldr s28, [sp, #368]
-; CHECK-GI-NEXT:    mov.s v22[1], v31[0]
-; CHECK-GI-NEXT:    mov.s v23[1], v27[0]
-; CHECK-GI-NEXT:    ldr s9, [sp, #304]
-; CHECK-GI-NEXT:    ldr s27, [sp, #400]
-; CHECK-GI-NEXT:    mov.s v18[2], v24[0]
-; CHECK-GI-NEXT:    ldr s24, [sp, #432]
-; CHECK-GI-NEXT:    mov.s v19[2], v28[0]
-; CHECK-GI-NEXT:    ldr s28, [sp, #464]
-; CHECK-GI-NEXT:    ldr s16, [sp, #248]
-; CHECK-GI-NEXT:    ldr s21, [sp, #280]
-; CHECK-GI-NEXT:    mov.s v17[2], v9[0]
-; CHECK-GI-NEXT:    mov.s v25[2], w2
-; CHECK-GI-NEXT:    mov.s v26[2], w6
-; CHECK-GI-NEXT:    mov.s v20[2], v27[0]
-; CHECK-GI-NEXT:    mov.s v22[2], v24[0]
-; CHECK-GI-NEXT:    mov.s v23[2], v28[0]
-; CHECK-GI-NEXT:    ldr s29, [sp, #312]
-; CHECK-GI-NEXT:    ldr s27, [sp, #344]
-; CHECK-GI-NEXT:    ldr s24, [sp, #376]
-; CHECK-GI-NEXT:    ldr s28, [sp, #408]
-; CHECK-GI-NEXT:    mov.s v6[3], v16[0]
-; CHECK-GI-NEXT:    ldr s16, [sp, #440]
-; CHECK-GI-NEXT:    mov.s v7[3], v21[0]
-; CHECK-GI-NEXT:    ldr s21, [sp, #472]
-; CHECK-GI-NEXT:    mov.s v25[3], w3
-; CHECK-GI-NEXT:    mov.s v26[3], w7
-; CHECK-GI-NEXT:    mov.s v17[3], v29[0]
-; CHECK-GI-NEXT:    mov.s v18[3], v27[0]
-; CHECK-GI-NEXT:    mov.s v19[3], v24[0]
-; CHECK-GI-NEXT:    mov.s v20[3], v28[0]
-; CHECK-GI-NEXT:    mov.s v22[3], v16[0]
-; CHECK-GI-NEXT:    mov.s v23[3], v21[0]
-; CHECK-GI-NEXT:    uzp1.8h v0, v0, v1
-; CHECK-GI-NEXT:    uzp1.8h v1, v2, v3
-; CHECK-GI-NEXT:    uzp1.8h v2, v4, v5
-; CHECK-GI-NEXT:    uzp1.8h v3, v6, v7
-; CHECK-GI-NEXT:    ldr x29, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    uzp1.8h v16, v25, v26
-; CHECK-GI-NEXT:    uzp1.8h v4, v17, v18
-; CHECK-GI-NEXT:    uzp1.8h v5, v19, v20
-; CHECK-GI-NEXT:    uzp1.8h v6, v22, v23
-; CHECK-GI-NEXT:    uzp1.16b v1, v1, v2
-; CHECK-GI-NEXT:    uzp1.16b v0, v16, v0
-; CHECK-GI-NEXT:    uzp1.16b v2, v3, v4
-; CHECK-GI-NEXT:    uzp1.16b v3, v5, v6
+; CHECK-GI-NEXT:    ldr w9, [sp, #80]
+; CHECK-GI-NEXT:    ldr w11, [sp, #88]
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s3, w1
+; CHECK-GI-NEXT:    ldr w8, [sp, #208]
+; CHECK-GI-NEXT:    ldr w10, [sp, #216]
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w9, [sp, #336]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    fmov s5, w10
+; CHECK-GI-NEXT:    ldr w11, [sp, #344]
+; CHECK-GI-NEXT:    mov.b v0[1], v3[0]
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #224]
+; CHECK-GI-NEXT:    mov.b v1[1], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w2
+; CHECK-GI-NEXT:    fmov s6, w11
+; CHECK-GI-NEXT:    mov.b v2[1], v5[0]
+; CHECK-GI-NEXT:    ldr w8, [sp, #96]
+; CHECK-GI-NEXT:    ldr w10, [sp, #352]
+; CHECK-GI-NEXT:    ldr w11, [sp, #16]
+; CHECK-GI-NEXT:    mov.b v0[2], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #232]
+; CHECK-GI-NEXT:    mov.b v3[1], v6[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w8, [sp, #104]
+; CHECK-GI-NEXT:    ldr w10, [sp, #360]
+; CHECK-GI-NEXT:    mov.b v2[2], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w3
+; CHECK-GI-NEXT:    mov.b v1[2], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #112]
+; CHECK-GI-NEXT:    mov.b v3[2], v6[0]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #368]
+; CHECK-GI-NEXT:    mov.b v0[3], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #240]
+; CHECK-GI-NEXT:    mov.b v1[3], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #120]
+; CHECK-GI-NEXT:    mov.b v2[3], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w4
+; CHECK-GI-NEXT:    mov.b v3[3], v6[0]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #376]
+; CHECK-GI-NEXT:    mov.b v0[4], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #248]
+; CHECK-GI-NEXT:    mov.b v1[4], v5[0]
+; CHECK-GI-NEXT:    mov.b v3[4], v6[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w8, [sp, #128]
+; CHECK-GI-NEXT:    ldr w10, [sp, #384]
+; CHECK-GI-NEXT:    mov.b v2[4], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w5
+; CHECK-GI-NEXT:    mov.b v1[5], v5[0]
+; CHECK-GI-NEXT:    mov.b v3[5], v6[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    mov.b v0[5], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #256]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w8, [sp, #136]
+; CHECK-GI-NEXT:    ldr w10, [sp, #392]
+; CHECK-GI-NEXT:    mov.b v2[5], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w6
+; CHECK-GI-NEXT:    mov.b v1[6], v5[0]
+; CHECK-GI-NEXT:    mov.b v3[6], v6[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w8, [sp, #144]
+; CHECK-GI-NEXT:    ldr w10, [sp, #400]
+; CHECK-GI-NEXT:    mov.b v0[6], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #264]
+; CHECK-GI-NEXT:    mov.b v1[7], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #152]
+; CHECK-GI-NEXT:    mov.b v3[7], v6[0]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #408]
+; CHECK-GI-NEXT:    mov.b v2[6], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w7
+; CHECK-GI-NEXT:    mov.b v1[8], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #160]
+; CHECK-GI-NEXT:    mov.b v0[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #272]
+; CHECK-GI-NEXT:    mov.b v3[8], v6[0]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #416]
+; CHECK-GI-NEXT:    mov.b v2[7], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #24]
+; CHECK-GI-NEXT:    mov.b v1[9], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #168]
+; CHECK-GI-NEXT:    mov.b v3[9], v6[0]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #424]
+; CHECK-GI-NEXT:    mov.b v0[8], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #280]
+; CHECK-GI-NEXT:    mov.b v1[10], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #176]
+; CHECK-GI-NEXT:    mov.b v2[8], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #32]
+; CHECK-GI-NEXT:    mov.b v3[10], v6[0]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #432]
+; CHECK-GI-NEXT:    mov.b v0[9], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #288]
+; CHECK-GI-NEXT:    mov.b v1[11], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #184]
+; CHECK-GI-NEXT:    mov.b v3[11], v6[0]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #440]
+; CHECK-GI-NEXT:    mov.b v2[9], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #40]
+; CHECK-GI-NEXT:    mov.b v1[12], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #192]
+; CHECK-GI-NEXT:    mov.b v0[10], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #296]
+; CHECK-GI-NEXT:    mov.b v3[12], v6[0]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #448]
+; CHECK-GI-NEXT:    mov.b v2[10], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #48]
+; CHECK-GI-NEXT:    mov.b v1[13], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #200]
+; CHECK-GI-NEXT:    mov.b v3[13], v6[0]
+; CHECK-GI-NEXT:    fmov s6, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #456]
+; CHECK-GI-NEXT:    mov.b v0[11], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #304]
+; CHECK-GI-NEXT:    fmov s7, w10
+; CHECK-GI-NEXT:    mov.b v1[14], v5[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    mov.b v2[11], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #56]
+; CHECK-GI-NEXT:    mov.b v3[14], v6[0]
+; CHECK-GI-NEXT:    mov.b v0[12], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #312]
+; CHECK-GI-NEXT:    mov.b v1[15], v5[0]
+; CHECK-GI-NEXT:    mov.b v3[15], v7[0]
+; CHECK-GI-NEXT:    mov.b v2[12], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #64]
 ; CHECK-GI-NEXT:    shl.16b v1, v1, #7
-; CHECK-GI-NEXT:    shl.16b v0, v0, #7
-; CHECK-GI-NEXT:    shl.16b v2, v2, #7
+; CHECK-GI-NEXT:    mov.b v0[13], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #320]
 ; CHECK-GI-NEXT:    shl.16b v3, v3, #7
 ; CHECK-GI-NEXT:    sshr.16b v1, v1, #7
+; CHECK-GI-NEXT:    mov.b v2[13], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #72]
+; CHECK-GI-NEXT:    sshr.16b v3, v3, #7
+; CHECK-GI-NEXT:    mov.b v0[14], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #328]
+; CHECK-GI-NEXT:    fmov s6, w9
+; CHECK-GI-NEXT:    mov.b v2[14], v4[0]
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    mov.b v0[15], v4[0]
+; CHECK-GI-NEXT:    mov.b v2[15], v6[0]
+; CHECK-GI-NEXT:    shl.16b v0, v0, #7
+; CHECK-GI-NEXT:    shl.16b v2, v2, #7
 ; CHECK-GI-NEXT:    sshr.16b v0, v0, #7
 ; CHECK-GI-NEXT:    sshr.16b v2, v2, #7
-; CHECK-GI-NEXT:    sshr.16b v3, v3, #7
-; CHECK-GI-NEXT:    ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
   %res = sext <64 x i1> %arg to <64 x i8>
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
index 38a568ac919168..a724958474cfb8 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -1022,11 +1022,18 @@ declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nou
 declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
 define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: uaddl_duprhs:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.2s, w0
-; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: uaddl_duprhs:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.2s, w0
+; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: uaddl_duprhs:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    dup v1.2d, x8
+; CHECK-GI-NEXT:    uaddw v0.2d, v1.2d, v0.2s
+; CHECK-GI-NEXT:    ret
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 
@@ -1048,8 +1055,8 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
 ;
 ; CHECK-GI-LABEL: uaddl2_duprhs:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    dup v1.2s, w0
-; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    dup v1.2d, x8
 ; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
 ; CHECK-GI-NEXT:    ret
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
@@ -1108,11 +1115,19 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
 }
 
 define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: usubl_duprhs:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    dup v1.2s, w0
-; CHECK-NEXT:    usubl v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: usubl_duprhs:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    dup v1.2s, w0
+; CHECK-SD-NEXT:    usubl v0.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: usubl_duprhs:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    dup v1.2d, x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT:    ret
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
 
@@ -1134,9 +1149,10 @@ define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
 ;
 ; CHECK-GI-LABEL: usubl2_duprhs:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    dup v1.2s, w0
-; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    usubl v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    dup v1.2d, x8
+; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
 ; CHECK-GI-NEXT:    ret
   %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
   %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll
index 402682c89124bd..6f4b090fb22bd6 100644
--- a/llvm/test/CodeGen/AArch64/neon-extadd.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll
@@ -1266,95 +1266,133 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: v20:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    ldr s4, [sp, #8]
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    ldr s2, [sp, #32]
-; CHECK-GI-NEXT:    ldr s19, [sp, #40]
-; CHECK-GI-NEXT:    fmov s3, w4
-; CHECK-GI-NEXT:    mov v0.s[1], v4.s[0]
-; CHECK-GI-NEXT:    ldr s16, [sp, #96]
-; CHECK-GI-NEXT:    ldr s22, [sp, #104]
-; CHECK-GI-NEXT:    mov v2.s[1], v19.s[0]
-; CHECK-GI-NEXT:    ldr s19, [sp, #128]
-; CHECK-GI-NEXT:    ldr s23, [sp, #136]
-; CHECK-GI-NEXT:    ldr s18, [sp, #16]
-; CHECK-GI-NEXT:    mov v1.s[1], w1
-; CHECK-GI-NEXT:    mov v3.s[1], w5
-; CHECK-GI-NEXT:    mov v16.s[1], v22.s[0]
-; CHECK-GI-NEXT:    mov v19.s[1], v23.s[0]
-; CHECK-GI-NEXT:    ldr s4, [sp, #64]
-; CHECK-GI-NEXT:    ldr s21, [sp, #72]
-; CHECK-GI-NEXT:    mov v0.s[2], v18.s[0]
-; CHECK-GI-NEXT:    ldr s18, [sp, #160]
-; CHECK-GI-NEXT:    ldr s24, [sp, #168]
-; CHECK-GI-NEXT:    ldr s20, [sp, #192]
-; CHECK-GI-NEXT:    ldr s25, [sp, #200]
-; CHECK-GI-NEXT:    ldr s22, [sp, #224]
-; CHECK-GI-NEXT:    ldr s27, [sp, #232]
-; CHECK-GI-NEXT:    ldr s23, [sp, #112]
-; CHECK-GI-NEXT:    ldr s26, [sp, #144]
-; CHECK-GI-NEXT:    mov v18.s[1], v24.s[0]
-; CHECK-GI-NEXT:    mov v20.s[1], v25.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v21.s[0]
-; CHECK-GI-NEXT:    mov v22.s[1], v27.s[0]
-; CHECK-GI-NEXT:    mov v1.s[2], w2
-; CHECK-GI-NEXT:    ldr s17, [sp, #48]
-; CHECK-GI-NEXT:    mov v3.s[2], w6
-; CHECK-GI-NEXT:    mov v16.s[2], v23.s[0]
-; CHECK-GI-NEXT:    mov v19.s[2], v26.s[0]
-; CHECK-GI-NEXT:    ldr s7, [sp, #80]
-; CHECK-GI-NEXT:    ldr s21, [sp, #176]
-; CHECK-GI-NEXT:    ldr s24, [sp, #208]
-; CHECK-GI-NEXT:    ldr s25, [sp, #240]
-; CHECK-GI-NEXT:    mov v2.s[2], v17.s[0]
-; CHECK-GI-NEXT:    ldr s17, [sp, #120]
-; CHECK-GI-NEXT:    ldr s23, [sp, #152]
-; CHECK-GI-NEXT:    ldr s5, [sp, #24]
-; CHECK-GI-NEXT:    mov v18.s[2], v21.s[0]
-; CHECK-GI-NEXT:    mov v20.s[2], v24.s[0]
-; CHECK-GI-NEXT:    mov v4.s[2], v7.s[0]
-; CHECK-GI-NEXT:    mov v22.s[2], v25.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], w3
-; CHECK-GI-NEXT:    mov v3.s[3], w7
-; CHECK-GI-NEXT:    mov v16.s[3], v17.s[0]
-; CHECK-GI-NEXT:    mov v19.s[3], v23.s[0]
-; CHECK-GI-NEXT:    ldr s6, [sp, #56]
-; CHECK-GI-NEXT:    ldr s7, [sp, #184]
-; CHECK-GI-NEXT:    ldr s21, [sp, #216]
-; CHECK-GI-NEXT:    ldr s17, [sp, #88]
-; CHECK-GI-NEXT:    mov v0.s[3], v5.s[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #248]
-; CHECK-GI-NEXT:    mov v2.s[3], v6.s[0]
-; CHECK-GI-NEXT:    mov v18.s[3], v7.s[0]
-; CHECK-GI-NEXT:    mov v20.s[3], v21.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v17.s[0]
-; CHECK-GI-NEXT:    mov v22.s[3], v5.s[0]
-; CHECK-GI-NEXT:    uzp1 v1.8h, v1.8h, v3.8h
-; CHECK-GI-NEXT:    movi v3.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT:    uzp1 v5.8h, v16.8h, v19.8h
-; CHECK-GI-NEXT:    dup v6.4s, w8
-; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT:    uzp1 v2.8h, v18.8h, v20.8h
-; CHECK-GI-NEXT:    uzp1 v4.8h, v4.8h, v6.8h
-; CHECK-GI-NEXT:    uzp1 v6.8h, v22.8h, v6.8h
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT:    and v5.16b, v5.16b, v3.16b
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
-; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT:    add v1.8h, v1.8h, v5.8h
-; CHECK-GI-NEXT:    and v4.16b, v4.16b, v3.16b
-; CHECK-GI-NEXT:    and v3.16b, v6.16b, v3.16b
-; CHECK-GI-NEXT:    add v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    add v3.4h, v4.4h, v3.4h
+; CHECK-GI-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-NEXT:    ldr w10, [sp, #72]
+; CHECK-GI-NEXT:    and w13, w2, #0xff
+; CHECK-GI-NEXT:    ldr w11, [sp, #80]
+; CHECK-GI-NEXT:    ldr w12, [sp, #88]
+; CHECK-GI-NEXT:    fmov s19, w13
+; CHECK-GI-NEXT:    fmov s0, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #224]
+; CHECK-GI-NEXT:    fmov s16, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #232]
+; CHECK-GI-NEXT:    fmov s3, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #240]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #248]
+; CHECK-GI-NEXT:    fmov s1, w12
+; CHECK-GI-NEXT:    fmov s7, w10
+; CHECK-GI-NEXT:    and w10, w1, #0xff
+; CHECK-GI-NEXT:    fmov s5, w11
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    and w9, w0, #0xff
+; CHECK-GI-NEXT:    ldrb w11, [sp]
+; CHECK-GI-NEXT:    ldrb w12, [sp, #8]
+; CHECK-GI-NEXT:    fmov s6, w9
+; CHECK-GI-NEXT:    fmov s20, w10
+; CHECK-GI-NEXT:    ldrb w9, [sp, #96]
+; CHECK-GI-NEXT:    ldrb w10, [sp, #104]
+; CHECK-GI-NEXT:    fmov s17, w11
+; CHECK-GI-NEXT:    fmov s21, w12
+; CHECK-GI-NEXT:    ldrb w11, [sp, #160]
+; CHECK-GI-NEXT:    mov v0.b[1], v16.b[0]
+; CHECK-GI-NEXT:    fmov s18, w9
+; CHECK-GI-NEXT:    fmov s22, w10
+; CHECK-GI-NEXT:    ldrb w9, [sp, #168]
+; CHECK-GI-NEXT:    mov v6.h[1], v20.h[0]
+; CHECK-GI-NEXT:    fmov s20, w11
+; CHECK-GI-NEXT:    ldrb w10, [sp, #16]
+; CHECK-GI-NEXT:    mov v17.h[1], v21.h[0]
+; CHECK-GI-NEXT:    fmov s21, w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #112]
+; CHECK-GI-NEXT:    mov v18.h[1], v22.h[0]
+; CHECK-GI-NEXT:    fmov s23, w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #176]
+; CHECK-GI-NEXT:    and w11, w3, #0xff
+; CHECK-GI-NEXT:    mov v2.b[1], v7.b[0]
+; CHECK-GI-NEXT:    mov v0.b[2], v3.b[0]
+; CHECK-GI-NEXT:    mov v6.h[2], v19.h[0]
+; CHECK-GI-NEXT:    fmov s19, w9
+; CHECK-GI-NEXT:    mov v20.h[1], v21.h[0]
+; CHECK-GI-NEXT:    ldrb w9, [sp, #24]
+; CHECK-GI-NEXT:    fmov s22, w11
+; CHECK-GI-NEXT:    mov v17.h[2], v23.h[0]
+; CHECK-GI-NEXT:    and w11, w4, #0xff
+; CHECK-GI-NEXT:    mov v18.h[2], v19.h[0]
+; CHECK-GI-NEXT:    fmov s19, w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #120]
+; CHECK-GI-NEXT:    fmov s23, w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #184]
+; CHECK-GI-NEXT:    mov v6.h[3], v22.h[0]
+; CHECK-GI-NEXT:    fmov s21, w11
+; CHECK-GI-NEXT:    and w11, w6, #0xff
+; CHECK-GI-NEXT:    mov v2.b[2], v5.b[0]
+; CHECK-GI-NEXT:    mov v20.h[2], v19.h[0]
+; CHECK-GI-NEXT:    fmov s19, w10
+; CHECK-GI-NEXT:    fmov s16, w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #128]
+; CHECK-GI-NEXT:    and w10, w5, #0xff
+; CHECK-GI-NEXT:    mov v17.h[3], v23.h[0]
+; CHECK-GI-NEXT:    mov v6.h[4], v21.h[0]
+; CHECK-GI-NEXT:    mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT:    mov v18.h[3], v19.h[0]
+; CHECK-GI-NEXT:    fmov s19, w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #192]
+; CHECK-GI-NEXT:    mov v20.h[3], v16.h[0]
+; CHECK-GI-NEXT:    fmov s16, w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #32]
+; CHECK-GI-NEXT:    mov v2.b[3], v4.b[0]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    mov v18.h[4], v19.h[0]
+; CHECK-GI-NEXT:    fmov s19, w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #136]
+; CHECK-GI-NEXT:    mov v6.h[5], v16.h[0]
+; CHECK-GI-NEXT:    fmov s16, w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #48]
+; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT:    mov v17.h[4], v19.h[0]
+; CHECK-GI-NEXT:    fmov s19, w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #40]
+; CHECK-GI-NEXT:    mov v18.h[5], v16.h[0]
+; CHECK-GI-NEXT:    fmov s16, w9
+; CHECK-GI-NEXT:    ldrb w9, [sp, #144]
+; CHECK-GI-NEXT:    mov v20.h[4], v19.h[0]
+; CHECK-GI-NEXT:    fmov s19, w11
+; CHECK-GI-NEXT:    ldrb w11, [sp, #200]
+; CHECK-GI-NEXT:    add v0.4h, v0.4h, v2.4h
+; CHECK-GI-NEXT:    fmov s7, w11
+; CHECK-GI-NEXT:    mov v17.h[5], v16.h[0]
+; CHECK-GI-NEXT:    fmov s16, w9
+; CHECK-GI-NEXT:    ldrb w11, [sp, #208]
+; CHECK-GI-NEXT:    mov v6.h[6], v19.h[0]
+; CHECK-GI-NEXT:    ldrb w9, [sp, #56]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov v20.h[5], v7.h[0]
+; CHECK-GI-NEXT:    fmov s7, w10
+; CHECK-GI-NEXT:    mov v18.h[6], v16.h[0]
+; CHECK-GI-NEXT:    fmov s16, w11
+; CHECK-GI-NEXT:    ldrb w10, [sp, #152]
+; CHECK-GI-NEXT:    and w11, w7, #0xff
+; CHECK-GI-NEXT:    fmov s3, w11
+; CHECK-GI-NEXT:    str q0, [x8, #64]
+; CHECK-GI-NEXT:    fmov s5, w10
+; CHECK-GI-NEXT:    ldrb w10, [sp, #216]
+; CHECK-GI-NEXT:    mov v17.h[6], v7.h[0]
+; CHECK-GI-NEXT:    mov v20.h[6], v16.h[0]
+; CHECK-GI-NEXT:    fmov s7, w9
+; CHECK-GI-NEXT:    mov v6.h[7], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w10
+; CHECK-GI-NEXT:    mov v18.h[7], v5.h[0]
+; CHECK-GI-NEXT:    mov v17.h[7], v7.h[0]
+; CHECK-GI-NEXT:    mov v20.h[7], v3.h[0]
+; CHECK-GI-NEXT:    add v1.8h, v6.8h, v18.8h
+; CHECK-GI-NEXT:    add v3.8h, v17.8h, v20.8h
+; CHECK-GI-NEXT:    ushll v4.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    stp q2, q1, [x8]
 ; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
-; CHECK-GI-NEXT:    stp q4, q0, [x8, #32]
-; CHECK-GI-NEXT:    str q2, [x8, #64]
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    stp q4, q1, [x8]
+; CHECK-GI-NEXT:    stp q2, q3, [x8, #32]
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <20 x i8> %s0 to <20 x i32>
@@ -1459,69 +1497,107 @@ define <16 x i32> @i12(<16 x i12> %s0, <16 x i12> %s1) {
 ;
 ; CHECK-GI-LABEL: i12:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    fmov s4, w4
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    ldr s20, [sp, #8]
-; CHECK-GI-NEXT:    ldr s2, [sp, #32]
-; CHECK-GI-NEXT:    ldr s21, [sp, #40]
-; CHECK-GI-NEXT:    ldr s16, [sp, #64]
-; CHECK-GI-NEXT:    ldr s22, [sp, #72]
-; CHECK-GI-NEXT:    ldr s17, [sp, #96]
-; CHECK-GI-NEXT:    ldr s23, [sp, #104]
-; CHECK-GI-NEXT:    mov v1.s[1], w1
-; CHECK-GI-NEXT:    mov v4.s[1], w5
-; CHECK-GI-NEXT:    ldr s18, [sp, #128]
-; CHECK-GI-NEXT:    ldr s24, [sp, #136]
-; CHECK-GI-NEXT:    mov v0.s[1], v20.s[0]
-; CHECK-GI-NEXT:    ldr s19, [sp, #160]
-; CHECK-GI-NEXT:    ldr s25, [sp, #168]
-; CHECK-GI-NEXT:    mov v2.s[1], v21.s[0]
-; CHECK-GI-NEXT:    mov v16.s[1], v22.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], v23.s[0]
-; CHECK-GI-NEXT:    mov v18.s[1], v24.s[0]
-; CHECK-GI-NEXT:    mov v19.s[1], v25.s[0]
-; CHECK-GI-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-NEXT:    ldr s7, [sp, #48]
-; CHECK-GI-NEXT:    ldr s20, [sp, #80]
-; CHECK-GI-NEXT:    ldr s21, [sp, #112]
-; CHECK-GI-NEXT:    ldr s22, [sp, #144]
-; CHECK-GI-NEXT:    ldr s23, [sp, #176]
-; CHECK-GI-NEXT:    mov v1.s[2], w2
-; CHECK-GI-NEXT:    mov v4.s[2], w6
-; CHECK-GI-NEXT:    mov v0.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v2.s[2], v7.s[0]
-; CHECK-GI-NEXT:    mov v16.s[2], v20.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], v21.s[0]
-; CHECK-GI-NEXT:    mov v18.s[2], v22.s[0]
-; CHECK-GI-NEXT:    mov v19.s[2], v23.s[0]
-; CHECK-GI-NEXT:    ldr s3, [sp, #24]
-; CHECK-GI-NEXT:    ldr s5, [sp, #56]
-; CHECK-GI-NEXT:    ldr s6, [sp, #88]
-; CHECK-GI-NEXT:    ldr s7, [sp, #120]
-; CHECK-GI-NEXT:    ldr s20, [sp, #152]
-; CHECK-GI-NEXT:    ldr s21, [sp, #184]
-; CHECK-GI-NEXT:    mov v1.s[3], w3
-; CHECK-GI-NEXT:    mov v4.s[3], w7
-; CHECK-GI-NEXT:    movi v22.4s, #15, msl #8
-; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
-; CHECK-GI-NEXT:    mov v2.s[3], v5.s[0]
-; CHECK-GI-NEXT:    mov v16.s[3], v6.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], v7.s[0]
-; CHECK-GI-NEXT:    mov v18.s[3], v20.s[0]
-; CHECK-GI-NEXT:    mov v19.s[3], v21.s[0]
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v22.16b
-; CHECK-GI-NEXT:    and v3.16b, v4.16b, v22.16b
-; CHECK-GI-NEXT:    and v4.16b, v0.16b, v22.16b
-; CHECK-GI-NEXT:    and v5.16b, v2.16b, v22.16b
-; CHECK-GI-NEXT:    and v0.16b, v16.16b, v22.16b
-; CHECK-GI-NEXT:    and v2.16b, v17.16b, v22.16b
-; CHECK-GI-NEXT:    and v6.16b, v18.16b, v22.16b
-; CHECK-GI-NEXT:    and v7.16b, v19.16b, v22.16b
-; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    add v1.4s, v3.4s, v2.4s
-; CHECK-GI-NEXT:    add v2.4s, v4.4s, v6.4s
-; CHECK-GI-NEXT:    add v3.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    ldr w8, [sp]
+; CHECK-GI-NEXT:    fmov s2, w5
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
+; CHECK-GI-NEXT:    ldr w11, [sp, #32]
+; CHECK-GI-NEXT:    ldr w12, [sp, #40]
+; CHECK-GI-NEXT:    fmov s5, w7
+; CHECK-GI-NEXT:    ldr w10, [sp, #16]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w4
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    fmov s4, w12
+; CHECK-GI-NEXT:    ldr w12, [sp, #96]
+; CHECK-GI-NEXT:    ldr w13, [sp, #104]
+; CHECK-GI-NEXT:    ldr w14, [sp, #128]
+; CHECK-GI-NEXT:    ldr w15, [sp, #136]
+; CHECK-GI-NEXT:    ldr w16, [sp, #160]
+; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w2
+; CHECK-GI-NEXT:    fmov s7, w13
+; CHECK-GI-NEXT:    fmov s16, w15
+; CHECK-GI-NEXT:    ldr w17, [sp, #168]
+; CHECK-GI-NEXT:    ldr w9, [sp, #24]
+; CHECK-GI-NEXT:    ldr w13, [sp, #176]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w6
+; CHECK-GI-NEXT:    fmov s17, w17
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #48]
+; CHECK-GI-NEXT:    mov v1.h[3], v5.h[0]
+; CHECK-GI-NEXT:    fmov s5, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #64]
+; CHECK-GI-NEXT:    mov v3.h[1], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w3
+; CHECK-GI-NEXT:    mov v2.h[2], v5.h[0]
+; CHECK-GI-NEXT:    fmov s5, w11
+; CHECK-GI-NEXT:    ldr w11, [sp, #72]
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    fmov s6, w11
+; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    mov v3.h[2], v5.h[0]
+; CHECK-GI-NEXT:    fmov s5, w10
+; CHECK-GI-NEXT:    ldr w9, [sp, #80]
+; CHECK-GI-NEXT:    ldr w10, [sp, #112]
+; CHECK-GI-NEXT:    ldr w11, [sp, #144]
+; CHECK-GI-NEXT:    mov v2.h[3], v4.h[0]
+; CHECK-GI-NEXT:    mov v5.h[1], v6.h[0]
+; CHECK-GI-NEXT:    fmov s6, w12
+; CHECK-GI-NEXT:    fmov s18, w11
+; CHECK-GI-NEXT:    ldr w12, [sp, #88]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NEXT:    fmov s7, w14
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v7.h[1], v16.h[0]
+; CHECK-GI-NEXT:    fmov s16, w16
+; CHECK-GI-NEXT:    mov v16.h[1], v17.h[0]
+; CHECK-GI-NEXT:    fmov s17, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #152]
+; CHECK-GI-NEXT:    mov v7.h[2], v18.h[0]
+; CHECK-GI-NEXT:    fmov s18, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #120]
+; CHECK-GI-NEXT:    mov v5.h[2], v17.h[0]
+; CHECK-GI-NEXT:    fmov s17, w10
+; CHECK-GI-NEXT:    ldr w10, [sp, #184]
+; CHECK-GI-NEXT:    mov v3.h[3], v18.h[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    fmov s18, w10
+; CHECK-GI-NEXT:    mov v6.h[2], v17.h[0]
+; CHECK-GI-NEXT:    fmov s17, w13
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v16.h[2], v17.h[0]
+; CHECK-GI-NEXT:    fmov s17, w12
+; CHECK-GI-NEXT:    mov v6.h[3], v4.h[0]
+; CHECK-GI-NEXT:    movi v4.4s, #15, msl #8
+; CHECK-GI-NEXT:    mov v5.h[3], v17.h[0]
+; CHECK-GI-NEXT:    fmov s17, w9
+; CHECK-GI-NEXT:    mov v16.h[3], v18.h[0]
+; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v4.16b
+; CHECK-GI-NEXT:    mov v7.h[3], v17.h[0]
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT:    and v3.16b, v3.16b, v4.16b
+; CHECK-GI-NEXT:    ushll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT:    and v6.16b, v6.16b, v4.16b
+; CHECK-GI-NEXT:    ushll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT:    and v5.16b, v5.16b, v4.16b
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v6.4s
+; CHECK-GI-NEXT:    and v7.16b, v7.16b, v4.16b
+; CHECK-GI-NEXT:    and v4.16b, v16.16b, v4.16b
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v5.4s
+; CHECK-GI-NEXT:    add v2.4s, v2.4s, v7.4s
+; CHECK-GI-NEXT:    add v3.4s, v3.4s, v4.4s
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = zext <16 x i12> %s0 to <16 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index 5237a3491de9b4..529a3b72e09714 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -219,12 +219,21 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i8_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
-; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-GI-NEXT:    lsl w8, w0, #8
+; CHECK-GI-NEXT:    lsl w9, w1, #8
+; CHECK-GI-NEXT:    lsl w10, w2, #8
+; CHECK-GI-NEXT:    sxth w8, w8
+; CHECK-GI-NEXT:    sxth w9, w9
+; CHECK-GI-NEXT:    sxth w10, w10
+; CHECK-GI-NEXT:    asr w8, w8, #8
+; CHECK-GI-NEXT:    asr w9, w9, #8
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    asr w8, w10, #8
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i8> %a to <3 x i16>
@@ -244,16 +253,12 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i8_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #24 // =0x18
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    mov v1.s[1], w8
-; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    neg v1.4s, v1.4s
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    sxtb w8, w0
+; CHECK-GI-NEXT:    sxtb w9, w1
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    sxtb w8, w2
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i8> %a to <3 x i32>
@@ -280,16 +285,15 @@ define <3 x i64> @sext_v3i8_v3i64(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    sxtb x8, w2
-; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    sxtb x8, w0
+; CHECK-GI-NEXT:    sxtb x9, w1
+; CHECK-GI-NEXT:    sxtb x10, w2
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    fmov d2, x10
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i8> %a to <3 x i64>
@@ -382,12 +386,21 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i10_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #6
-; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #6
+; CHECK-GI-NEXT:    lsl w8, w0, #6
+; CHECK-GI-NEXT:    lsl w9, w1, #6
+; CHECK-GI-NEXT:    lsl w10, w2, #6
+; CHECK-GI-NEXT:    sxth w8, w8
+; CHECK-GI-NEXT:    sxth w9, w9
+; CHECK-GI-NEXT:    sxth w10, w10
+; CHECK-GI-NEXT:    asr w8, w8, #6
+; CHECK-GI-NEXT:    asr w9, w9, #6
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    asr w8, w10, #6
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i10> %a to <3 x i16>
@@ -407,16 +420,12 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i10_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #22 // =0x16
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    mov v1.s[1], w8
-; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    neg v1.4s, v1.4s
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    sbfx w8, w0, #0, #10
+; CHECK-GI-NEXT:    sbfx w9, w1, #0, #10
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    sbfx w8, w2, #0, #10
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i10> %a to <3 x i32>
@@ -443,16 +452,15 @@ define <3 x i64> @sext_v3i10_v3i64(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    sbfx x8, x2, #0, #10
-; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #54
-; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #54
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    sbfx x8, x0, #0, #10
+; CHECK-GI-NEXT:    sbfx x9, x1, #0, #10
+; CHECK-GI-NEXT:    sbfx x10, x2, #0, #10
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    fmov d2, x10
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <3 x i10> %a to <3 x i64>
@@ -1024,34 +1032,48 @@ define <16 x i16> @sext_v16i10_v16i16(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v16i10_v16i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s4, w0
-; CHECK-GI-NEXT:    fmov s5, w4
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    ldr s1, [sp, #8]
-; CHECK-GI-NEXT:    ldr s2, [sp, #32]
-; CHECK-GI-NEXT:    ldr s3, [sp, #40]
-; CHECK-GI-NEXT:    mov v4.s[1], w1
-; CHECK-GI-NEXT:    mov v5.s[1], w5
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    ldr s1, [sp, #16]
-; CHECK-GI-NEXT:    ldr s3, [sp, #48]
-; CHECK-GI-NEXT:    mov v4.s[2], w2
-; CHECK-GI-NEXT:    mov v5.s[2], w6
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[2], v3.s[0]
-; CHECK-GI-NEXT:    ldr s1, [sp, #24]
-; CHECK-GI-NEXT:    ldr s3, [sp, #56]
-; CHECK-GI-NEXT:    mov v4.s[3], w3
-; CHECK-GI-NEXT:    mov v5.s[3], w7
-; CHECK-GI-NEXT:    mov v0.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[3], v3.s[0]
-; CHECK-GI-NEXT:    uzp1 v1.8h, v4.8h, v5.8h
-; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    ldr w8, [sp]
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s2, w1
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w2
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w3
+; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-NEXT:    mov v0.h[3], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w4
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-NEXT:    mov v0.h[4], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w5
+; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #48]
+; CHECK-GI-NEXT:    mov v0.h[5], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w6
+; CHECK-GI-NEXT:    mov v1.h[5], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-NEXT:    mov v0.h[6], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w7
+; CHECK-GI-NEXT:    mov v1.h[6], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v0.h[7], v2.h[0]
+; CHECK-GI-NEXT:    mov v1.h[7], v3.h[0]
+; CHECK-GI-NEXT:    shl v0.8h, v0.8h, #6
 ; CHECK-GI-NEXT:    shl v1.8h, v1.8h, #6
-; CHECK-GI-NEXT:    shl v2.8h, v0.8h, #6
-; CHECK-GI-NEXT:    sshr v0.8h, v1.8h, #6
-; CHECK-GI-NEXT:    sshr v1.8h, v2.8h, #6
+; CHECK-GI-NEXT:    sshr v0.8h, v0.8h, #6
+; CHECK-GI-NEXT:    sshr v1.8h, v1.8h, #6
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i10> %a to <16 x i16>
@@ -1101,36 +1123,54 @@ define <16 x i32> @sext_v16i10_v16i32(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v16i10_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s4, w0
-; CHECK-GI-NEXT:    fmov s5, w4
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    ldr s1, [sp, #8]
-; CHECK-GI-NEXT:    ldr s2, [sp, #32]
-; CHECK-GI-NEXT:    ldr s3, [sp, #40]
-; CHECK-GI-NEXT:    mov v4.s[1], w1
-; CHECK-GI-NEXT:    mov v5.s[1], w5
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    ldr s1, [sp, #16]
-; CHECK-GI-NEXT:    ldr s3, [sp, #48]
-; CHECK-GI-NEXT:    mov v4.s[2], w2
-; CHECK-GI-NEXT:    mov v5.s[2], w6
-; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[2], v3.s[0]
-; CHECK-GI-NEXT:    ldr s1, [sp, #24]
-; CHECK-GI-NEXT:    ldr s3, [sp, #56]
-; CHECK-GI-NEXT:    mov v4.s[3], w3
-; CHECK-GI-NEXT:    mov v5.s[3], w7
-; CHECK-GI-NEXT:    mov v0.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[3], v3.s[0]
-; CHECK-GI-NEXT:    shl v1.4s, v4.4s, #22
-; CHECK-GI-NEXT:    shl v3.4s, v5.4s, #22
-; CHECK-GI-NEXT:    shl v4.4s, v0.4s, #22
-; CHECK-GI-NEXT:    shl v5.4s, v2.4s, #22
-; CHECK-GI-NEXT:    sshr v0.4s, v1.4s, #22
-; CHECK-GI-NEXT:    sshr v1.4s, v3.4s, #22
-; CHECK-GI-NEXT:    sshr v2.4s, v4.4s, #22
-; CHECK-GI-NEXT:    sshr v3.4s, v5.4s, #22
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    ldr w8, [sp]
+; CHECK-GI-NEXT:    fmov s2, w5
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
+; CHECK-GI-NEXT:    ldr w10, [sp, #32]
+; CHECK-GI-NEXT:    ldr w11, [sp, #40]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w4
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    fmov s5, w10
+; CHECK-GI-NEXT:    fmov s6, w11
+; CHECK-GI-NEXT:    ldr w9, [sp, #48]
+; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w2
+; CHECK-GI-NEXT:    mov v3.h[1], v4.h[0]
+; CHECK-GI-NEXT:    mov v5.h[1], v6.h[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    fmov s6, w9
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    ldr w9, [sp, #56]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w6
+; CHECK-GI-NEXT:    mov v3.h[2], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v5.h[2], v6.h[0]
+; CHECK-GI-NEXT:    fmov s6, w9
+; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w3
+; CHECK-GI-NEXT:    mov v3.h[3], v4.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w7
+; CHECK-GI-NEXT:    mov v5.h[3], v6.h[0]
+; CHECK-GI-NEXT:    mov v1.h[3], v2.h[0]
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v5.4h, #0
+; CHECK-GI-NEXT:    shl v2.4s, v2.4s, #22
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #22
+; CHECK-GI-NEXT:    shl v3.4s, v3.4s, #22
+; CHECK-GI-NEXT:    sshr v2.4s, v2.4s, #22
+; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #22
+; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #22
+; CHECK-GI-NEXT:    sshr v3.4s, v3.4s, #22
+; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #22
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sext <16 x i10> %a to <16 x i32>
@@ -1188,49 +1228,69 @@ define <16 x i64> @sext_v16i10_v16i64(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: sext_v16i10_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s7, w0
-; CHECK-GI-NEXT:    fmov s17, w2
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    fmov s18, w4
-; CHECK-GI-NEXT:    fmov s19, w6
-; CHECK-GI-NEXT:    ldr s1, [sp, #8]
-; CHECK-GI-NEXT:    ldr s2, [sp, #16]
-; CHECK-GI-NEXT:    ldr s3, [sp, #24]
-; CHECK-GI-NEXT:    ldr s4, [sp, #32]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
-; CHECK-GI-NEXT:    ldr s6, [sp, #48]
-; CHECK-GI-NEXT:    ldr s16, [sp, #56]
-; CHECK-GI-NEXT:    mov v7.s[1], w1
-; CHECK-GI-NEXT:    mov v17.s[1], w3
-; CHECK-GI-NEXT:    mov v18.s[1], w5
-; CHECK-GI-NEXT:    mov v19.s[1], w7
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    mov v6.s[1], v16.s[0]
-; CHECK-GI-NEXT:    ushll v1.2d, v7.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v17.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v18.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v19.2s, #0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v4.2d, v4.2s, #0
-; CHECK-GI-NEXT:    ushll v6.2d, v6.2s, #0
-; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #54
-; CHECK-GI-NEXT:    shl v3.2d, v3.2d, #54
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    ldr w8, [sp]
+; CHECK-GI-NEXT:    fmov s2, w5
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
+; CHECK-GI-NEXT:    ldr w10, [sp, #32]
+; CHECK-GI-NEXT:    ldr w11, [sp, #40]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w4
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w9, [sp, #48]
+; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w10
+; CHECK-GI-NEXT:    mov v3.h[1], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w2
+; CHECK-GI-NEXT:    mov v2.h[2], v5.h[0]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w6
+; CHECK-GI-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #56]
+; CHECK-GI-NEXT:    mov v3.h[2], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w3
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w7
+; CHECK-GI-NEXT:    ushll v6.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll2 v2.2d, v2.4s, #0
+; CHECK-GI-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    shl v6.2d, v6.2d, #54
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    shl v18.2d, v2.2d, #54
+; CHECK-GI-NEXT:    mov v3.h[3], v4.h[0]
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v5.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
+; CHECK-GI-NEXT:    shl v4.2d, v4.2d, #54
+; CHECK-GI-NEXT:    shl v16.2d, v0.2d, #54
+; CHECK-GI-NEXT:    ushll v7.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
 ; CHECK-GI-NEXT:    shl v5.2d, v5.2d, #54
+; CHECK-GI-NEXT:    shl v17.2d, v1.2d, #54
+; CHECK-GI-NEXT:    sshr v0.2d, v4.2d, #54
+; CHECK-GI-NEXT:    sshr v1.2d, v16.2d, #54
+; CHECK-GI-NEXT:    sshr v4.2d, v6.2d, #54
 ; CHECK-GI-NEXT:    shl v7.2d, v7.2d, #54
-; CHECK-GI-NEXT:    shl v16.2d, v0.2d, #54
-; CHECK-GI-NEXT:    shl v17.2d, v2.2d, #54
-; CHECK-GI-NEXT:    shl v18.2d, v4.2d, #54
-; CHECK-GI-NEXT:    shl v19.2d, v6.2d, #54
-; CHECK-GI-NEXT:    sshr v0.2d, v1.2d, #54
-; CHECK-GI-NEXT:    sshr v1.2d, v3.2d, #54
+; CHECK-GI-NEXT:    shl v19.2d, v3.2d, #54
 ; CHECK-GI-NEXT:    sshr v2.2d, v5.2d, #54
-; CHECK-GI-NEXT:    sshr v3.2d, v7.2d, #54
-; CHECK-GI-NEXT:    sshr v4.2d, v16.2d, #54
-; CHECK-GI-NEXT:    sshr v5.2d, v17.2d, #54
-; CHECK-GI-NEXT:    sshr v6.2d, v18.2d, #54
+; CHECK-GI-NEXT:    sshr v3.2d, v17.2d, #54
+; CHECK-GI-NEXT:    sshr v5.2d, v18.2d, #54
+; CHECK-GI-NEXT:    sshr v6.2d, v7.2d, #54
 ; CHECK-GI-NEXT:    sshr v7.2d, v19.2d, #54
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index c81fd26a775256..54ada05c904487 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -3812,51 +3812,72 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v24i8_v24i16_zext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s4, w0
-; CHECK-GI-NEXT:    fmov s5, w4
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-NEXT:    ldr s1, [sp, #32]
-; CHECK-GI-NEXT:    ldr s7, [sp, #40]
-; CHECK-GI-NEXT:    ldr s2, [sp, #64]
-; CHECK-GI-NEXT:    ldr s16, [sp, #72]
-; CHECK-GI-NEXT:    ldr s3, [sp, #96]
-; CHECK-GI-NEXT:    ldr s17, [sp, #104]
-; CHECK-GI-NEXT:    mov v4.s[1], w1
-; CHECK-GI-NEXT:    mov v5.s[1], w5
-; CHECK-GI-NEXT:    mov v0.s[1], v6.s[0]
-; CHECK-GI-NEXT:    mov v1.s[1], v7.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v3.s[1], v17.s[0]
-; CHECK-GI-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-NEXT:    ldr s7, [sp, #48]
-; CHECK-GI-NEXT:    ldr s16, [sp, #80]
-; CHECK-GI-NEXT:    ldr s17, [sp, #112]
-; CHECK-GI-NEXT:    mov v4.s[2], w2
-; CHECK-GI-NEXT:    mov v5.s[2], w6
-; CHECK-GI-NEXT:    mov v0.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v1.s[2], v7.s[0]
-; CHECK-GI-NEXT:    mov v2.s[2], v16.s[0]
-; CHECK-GI-NEXT:    mov v3.s[2], v17.s[0]
-; CHECK-GI-NEXT:    ldr s6, [sp, #24]
-; CHECK-GI-NEXT:    ldr s7, [sp, #56]
-; CHECK-GI-NEXT:    ldr s16, [sp, #88]
-; CHECK-GI-NEXT:    ldr s17, [sp, #120]
-; CHECK-GI-NEXT:    mov v4.s[3], w3
-; CHECK-GI-NEXT:    mov v5.s[3], w7
-; CHECK-GI-NEXT:    mov v0.s[3], v6.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v7.s[0]
-; CHECK-GI-NEXT:    mov v2.s[3], v16.s[0]
-; CHECK-GI-NEXT:    mov v3.s[3], v17.s[0]
-; CHECK-GI-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
-; CHECK-GI-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    ldr w8, [sp]
+; CHECK-GI-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-NEXT:    ldr w10, [sp, #72]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w2
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w3
+; CHECK-GI-NEXT:    mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w4
+; CHECK-GI-NEXT:    mov v0.b[4], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w5
+; CHECK-GI-NEXT:    mov v0.b[5], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w6
+; CHECK-GI-NEXT:    mov v0.b[6], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w7
+; CHECK-GI-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #8]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-NEXT:    mov v0.b[8], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #80]
+; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-NEXT:    mov v0.b[9], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-NEXT:    mov v0.b[10], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-NEXT:    mov v1.b[3], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-NEXT:    mov v0.b[11], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-NEXT:    mov v1.b[4], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-NEXT:    mov v0.b[12], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #48]
+; CHECK-GI-NEXT:    mov v1.b[5], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-NEXT:    mov v0.b[13], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-NEXT:    mov v1.b[6], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v0.b[14], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[7], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[15], v3.b[0]
 ; CHECK-GI-NEXT:    uaddlv h1, v1.8b
-; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    uaddlv h0, v0.16b
 ; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w9
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3938,51 +3959,72 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) {
 ;
 ; CHECK-GI-LABEL: add_v24i8_v24i16_sext:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s4, w0
-; CHECK-GI-NEXT:    fmov s5, w4
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-NEXT:    ldr s1, [sp, #32]
-; CHECK-GI-NEXT:    ldr s7, [sp, #40]
-; CHECK-GI-NEXT:    ldr s2, [sp, #64]
-; CHECK-GI-NEXT:    ldr s16, [sp, #72]
-; CHECK-GI-NEXT:    ldr s3, [sp, #96]
-; CHECK-GI-NEXT:    ldr s17, [sp, #104]
-; CHECK-GI-NEXT:    mov v4.s[1], w1
-; CHECK-GI-NEXT:    mov v5.s[1], w5
-; CHECK-GI-NEXT:    mov v0.s[1], v6.s[0]
-; CHECK-GI-NEXT:    mov v1.s[1], v7.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v3.s[1], v17.s[0]
-; CHECK-GI-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-NEXT:    ldr s7, [sp, #48]
-; CHECK-GI-NEXT:    ldr s16, [sp, #80]
-; CHECK-GI-NEXT:    ldr s17, [sp, #112]
-; CHECK-GI-NEXT:    mov v4.s[2], w2
-; CHECK-GI-NEXT:    mov v5.s[2], w6
-; CHECK-GI-NEXT:    mov v0.s[2], v6.s[0]
-; CHECK-GI-NEXT:    mov v1.s[2], v7.s[0]
-; CHECK-GI-NEXT:    mov v2.s[2], v16.s[0]
-; CHECK-GI-NEXT:    mov v3.s[2], v17.s[0]
-; CHECK-GI-NEXT:    ldr s6, [sp, #24]
-; CHECK-GI-NEXT:    ldr s7, [sp, #56]
-; CHECK-GI-NEXT:    ldr s16, [sp, #88]
-; CHECK-GI-NEXT:    ldr s17, [sp, #120]
-; CHECK-GI-NEXT:    mov v4.s[3], w3
-; CHECK-GI-NEXT:    mov v5.s[3], w7
-; CHECK-GI-NEXT:    mov v0.s[3], v6.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v7.s[0]
-; CHECK-GI-NEXT:    mov v2.s[3], v16.s[0]
-; CHECK-GI-NEXT:    mov v3.s[3], v17.s[0]
-; CHECK-GI-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
-; CHECK-GI-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    ldr w8, [sp]
+; CHECK-GI-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-NEXT:    ldr w10, [sp, #72]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w2
+; CHECK-GI-NEXT:    fmov s2, w10
+; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w3
+; CHECK-GI-NEXT:    mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w4
+; CHECK-GI-NEXT:    mov v0.b[4], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w5
+; CHECK-GI-NEXT:    mov v0.b[5], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w6
+; CHECK-GI-NEXT:    mov v0.b[6], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w7
+; CHECK-GI-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #8]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-NEXT:    mov v0.b[8], v1.b[0]
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #80]
+; CHECK-GI-NEXT:    mov v1.b[1], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-NEXT:    mov v0.b[9], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-NEXT:    mov v0.b[10], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-NEXT:    mov v1.b[3], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-NEXT:    mov v0.b[11], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-NEXT:    mov v1.b[4], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-NEXT:    mov v0.b[12], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #48]
+; CHECK-GI-NEXT:    mov v1.b[5], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-NEXT:    mov v0.b[13], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-NEXT:    mov v1.b[6], v2.b[0]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    mov v0.b[14], v3.b[0]
+; CHECK-GI-NEXT:    fmov s3, w8
+; CHECK-GI-NEXT:    mov v1.b[7], v2.b[0]
+; CHECK-GI-NEXT:    mov v0.b[15], v3.b[0]
 ; CHECK-GI-NEXT:    saddlv h1, v1.8b
-; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    saddlv h0, v0.16b
 ; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w9
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4125,106 +4167,149 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    fmov s4, w0
-; CHECK-GI-BASE-NEXT:    fmov s5, w4
-; CHECK-GI-BASE-NEXT:    ldr s0, [sp]
-; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-BASE-NEXT:    ldr s1, [sp, #32]
-; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #40]
-; CHECK-GI-BASE-NEXT:    ldr s2, [sp, #64]
-; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #72]
-; CHECK-GI-BASE-NEXT:    ldr s3, [sp, #96]
-; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #104]
-; CHECK-GI-BASE-NEXT:    mov v4.s[1], w1
-; CHECK-GI-BASE-NEXT:    mov v5.s[1], w5
-; CHECK-GI-BASE-NEXT:    mov v0.s[1], v6.s[0]
-; CHECK-GI-BASE-NEXT:    mov v1.s[1], v7.s[0]
-; CHECK-GI-BASE-NEXT:    mov v2.s[1], v16.s[0]
-; CHECK-GI-BASE-NEXT:    mov v3.s[1], v17.s[0]
-; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #48]
-; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #80]
-; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #112]
-; CHECK-GI-BASE-NEXT:    mov v4.s[2], w2
-; CHECK-GI-BASE-NEXT:    mov v5.s[2], w6
-; CHECK-GI-BASE-NEXT:    mov v0.s[2], v6.s[0]
-; CHECK-GI-BASE-NEXT:    mov v1.s[2], v7.s[0]
-; CHECK-GI-BASE-NEXT:    mov v2.s[2], v16.s[0]
-; CHECK-GI-BASE-NEXT:    mov v3.s[2], v17.s[0]
-; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #24]
-; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #56]
-; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #88]
-; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #120]
-; CHECK-GI-BASE-NEXT:    mov v4.s[3], w3
-; CHECK-GI-BASE-NEXT:    mov v5.s[3], w7
-; CHECK-GI-BASE-NEXT:    mov v0.s[3], v6.s[0]
-; CHECK-GI-BASE-NEXT:    mov v1.s[3], v7.s[0]
-; CHECK-GI-BASE-NEXT:    mov v2.s[3], v16.s[0]
-; CHECK-GI-BASE-NEXT:    mov v3.s[3], v17.s[0]
-; CHECK-GI-BASE-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-BASE-NEXT:    xtn v1.8b, v1.8h
-; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    fmov s0, w0
+; CHECK-GI-BASE-NEXT:    fmov s1, w1
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp]
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-BASE-NEXT:    ldr w10, [sp, #72]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w2
+; CHECK-GI-BASE-NEXT:    fmov s2, w10
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w3
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w4
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w5
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w6
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w7
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #8]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-BASE-NEXT:    mov v0.b[8], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #80]
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-BASE-NEXT:    mov v0.b[9], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-BASE-NEXT:    mov v0.b[10], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-BASE-NEXT:    mov v0.b[11], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-BASE-NEXT:    mov v0.b[12], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #48]
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-BASE-NEXT:    mov v0.b[13], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    mov v0.b[14], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], v2.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[15], v3.b[0]
 ; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    fmov s4, w0
-; CHECK-GI-DOT-NEXT:    fmov s5, w4
-; CHECK-GI-DOT-NEXT:    ldr s0, [sp]
-; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-DOT-NEXT:    ldr s1, [sp, #32]
-; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #40]
-; CHECK-GI-DOT-NEXT:    ldr s2, [sp, #64]
-; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #72]
-; CHECK-GI-DOT-NEXT:    ldr s3, [sp, #96]
-; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #104]
-; CHECK-GI-DOT-NEXT:    mov v4.s[1], w1
-; CHECK-GI-DOT-NEXT:    mov v5.s[1], w5
-; CHECK-GI-DOT-NEXT:    mov v0.s[1], v6.s[0]
-; CHECK-GI-DOT-NEXT:    mov v1.s[1], v7.s[0]
-; CHECK-GI-DOT-NEXT:    mov v2.s[1], v16.s[0]
-; CHECK-GI-DOT-NEXT:    mov v3.s[1], v17.s[0]
-; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #48]
-; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #80]
-; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #112]
-; CHECK-GI-DOT-NEXT:    mov v4.s[2], w2
-; CHECK-GI-DOT-NEXT:    mov v5.s[2], w6
-; CHECK-GI-DOT-NEXT:    mov v0.s[2], v6.s[0]
-; CHECK-GI-DOT-NEXT:    mov v1.s[2], v7.s[0]
-; CHECK-GI-DOT-NEXT:    mov v2.s[2], v16.s[0]
-; CHECK-GI-DOT-NEXT:    mov v3.s[2], v17.s[0]
-; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #24]
-; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #56]
-; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #88]
-; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #120]
-; CHECK-GI-DOT-NEXT:    mov v4.s[3], w3
-; CHECK-GI-DOT-NEXT:    mov v5.s[3], w7
-; CHECK-GI-DOT-NEXT:    mov v0.s[3], v6.s[0]
-; CHECK-GI-DOT-NEXT:    mov v1.s[3], v7.s[0]
-; CHECK-GI-DOT-NEXT:    mov v2.s[3], v16.s[0]
-; CHECK-GI-DOT-NEXT:    mov v3.s[3], v17.s[0]
-; CHECK-GI-DOT-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-DOT-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT:    fmov s0, w0
+; CHECK-GI-DOT-NEXT:    fmov s1, w1
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp]
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-DOT-NEXT:    ldr w10, [sp, #72]
+; CHECK-GI-DOT-NEXT:    movi v4.8b, #1
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8]
+; CHECK-GI-DOT-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w2
+; CHECK-GI-DOT-NEXT:    fmov s3, w10
+; CHECK-GI-DOT-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w3
+; CHECK-GI-DOT-NEXT:    mov v0.b[3], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w4
+; CHECK-GI-DOT-NEXT:    mov v0.b[4], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w5
+; CHECK-GI-DOT-NEXT:    mov v0.b[5], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w6
+; CHECK-GI-DOT-NEXT:    mov v0.b[6], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w7
+; CHECK-GI-DOT-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #80]
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-DOT-NEXT:    mov v0.b[8], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-DOT-NEXT:    mov v0.b[9], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-DOT-NEXT:    mov v0.b[10], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-DOT-NEXT:    mov v0.b[11], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-DOT-NEXT:    mov v0.b[12], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #48]
+; CHECK-GI-DOT-NEXT:    fmov s5, w9
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-DOT-NEXT:    mov v0.b[13], v2.b[0]
 ; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
-; CHECK-GI-DOT-NEXT:    movi v3.8b, #1
-; CHECK-GI-DOT-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    xtn v1.8b, v1.8h
-; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT:    udot v5.4s, v0.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    udot v4.4s, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], v5.b[0]
+; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v0.b[14], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w8
+; CHECK-GI-DOT-NEXT:    mov v4.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT:    fmov d1, d1
+; CHECK-GI-DOT-NEXT:    mov v0.b[15], v3.b[0]
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    udot v5.4s, v0.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v3.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
@@ -4398,106 +4483,149 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
 ;
 ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext:
 ; CHECK-GI-BASE:       // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT:    fmov s4, w0
-; CHECK-GI-BASE-NEXT:    fmov s5, w4
-; CHECK-GI-BASE-NEXT:    ldr s0, [sp]
-; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-BASE-NEXT:    ldr s1, [sp, #32]
-; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #40]
-; CHECK-GI-BASE-NEXT:    ldr s2, [sp, #64]
-; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #72]
-; CHECK-GI-BASE-NEXT:    ldr s3, [sp, #96]
-; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #104]
-; CHECK-GI-BASE-NEXT:    mov v4.s[1], w1
-; CHECK-GI-BASE-NEXT:    mov v5.s[1], w5
-; CHECK-GI-BASE-NEXT:    mov v0.s[1], v6.s[0]
-; CHECK-GI-BASE-NEXT:    mov v1.s[1], v7.s[0]
-; CHECK-GI-BASE-NEXT:    mov v2.s[1], v16.s[0]
-; CHECK-GI-BASE-NEXT:    mov v3.s[1], v17.s[0]
-; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #48]
-; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #80]
-; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #112]
-; CHECK-GI-BASE-NEXT:    mov v4.s[2], w2
-; CHECK-GI-BASE-NEXT:    mov v5.s[2], w6
-; CHECK-GI-BASE-NEXT:    mov v0.s[2], v6.s[0]
-; CHECK-GI-BASE-NEXT:    mov v1.s[2], v7.s[0]
-; CHECK-GI-BASE-NEXT:    mov v2.s[2], v16.s[0]
-; CHECK-GI-BASE-NEXT:    mov v3.s[2], v17.s[0]
-; CHECK-GI-BASE-NEXT:    ldr s6, [sp, #24]
-; CHECK-GI-BASE-NEXT:    ldr s7, [sp, #56]
-; CHECK-GI-BASE-NEXT:    ldr s16, [sp, #88]
-; CHECK-GI-BASE-NEXT:    ldr s17, [sp, #120]
-; CHECK-GI-BASE-NEXT:    mov v4.s[3], w3
-; CHECK-GI-BASE-NEXT:    mov v5.s[3], w7
-; CHECK-GI-BASE-NEXT:    mov v0.s[3], v6.s[0]
-; CHECK-GI-BASE-NEXT:    mov v1.s[3], v7.s[0]
-; CHECK-GI-BASE-NEXT:    mov v2.s[3], v16.s[0]
-; CHECK-GI-BASE-NEXT:    mov v3.s[3], v17.s[0]
-; CHECK-GI-BASE-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-BASE-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
-; CHECK-GI-BASE-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-BASE-NEXT:    xtn v1.8b, v1.8h
-; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
+; CHECK-GI-BASE-NEXT:    fmov s0, w0
+; CHECK-GI-BASE-NEXT:    fmov s1, w1
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp]
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-BASE-NEXT:    ldr w10, [sp, #72]
+; CHECK-GI-BASE-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w2
+; CHECK-GI-BASE-NEXT:    fmov s2, w10
+; CHECK-GI-BASE-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w3
+; CHECK-GI-BASE-NEXT:    mov v0.b[3], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w4
+; CHECK-GI-BASE-NEXT:    mov v0.b[4], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w5
+; CHECK-GI-BASE-NEXT:    mov v0.b[5], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w6
+; CHECK-GI-BASE-NEXT:    mov v0.b[6], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w7
+; CHECK-GI-BASE-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #8]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-BASE-NEXT:    mov v0.b[8], v1.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s1, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #80]
+; CHECK-GI-BASE-NEXT:    mov v1.b[1], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-BASE-NEXT:    mov v0.b[9], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-BASE-NEXT:    mov v1.b[2], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-BASE-NEXT:    mov v0.b[10], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-BASE-NEXT:    mov v1.b[3], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-BASE-NEXT:    mov v0.b[11], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-BASE-NEXT:    mov v1.b[4], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-BASE-NEXT:    mov v0.b[12], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #48]
+; CHECK-GI-BASE-NEXT:    mov v1.b[5], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-BASE-NEXT:    mov v0.b[13], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-BASE-NEXT:    mov v1.b[6], v2.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s2, w9
+; CHECK-GI-BASE-NEXT:    mov v0.b[14], v3.b[0]
+; CHECK-GI-BASE-NEXT:    fmov s3, w8
+; CHECK-GI-BASE-NEXT:    mov v1.b[7], v2.b[0]
+; CHECK-GI-BASE-NEXT:    mov v0.b[15], v3.b[0]
 ; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b
-; CHECK-GI-BASE-NEXT:    fmov w8, s0
+; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
 ; CHECK-GI-BASE-NEXT:    fmov w9, s1
+; CHECK-GI-BASE-NEXT:    fmov w8, s0
 ; CHECK-GI-BASE-NEXT:    add w8, w8, w9
 ; CHECK-GI-BASE-NEXT:    sxth w0, w8
 ; CHECK-GI-BASE-NEXT:    ret
 ;
 ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext:
 ; CHECK-GI-DOT:       // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT:    fmov s4, w0
-; CHECK-GI-DOT-NEXT:    fmov s5, w4
-; CHECK-GI-DOT-NEXT:    ldr s0, [sp]
-; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #8]
-; CHECK-GI-DOT-NEXT:    ldr s1, [sp, #32]
-; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #40]
-; CHECK-GI-DOT-NEXT:    ldr s2, [sp, #64]
-; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #72]
-; CHECK-GI-DOT-NEXT:    ldr s3, [sp, #96]
-; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #104]
-; CHECK-GI-DOT-NEXT:    mov v4.s[1], w1
-; CHECK-GI-DOT-NEXT:    mov v5.s[1], w5
-; CHECK-GI-DOT-NEXT:    mov v0.s[1], v6.s[0]
-; CHECK-GI-DOT-NEXT:    mov v1.s[1], v7.s[0]
-; CHECK-GI-DOT-NEXT:    mov v2.s[1], v16.s[0]
-; CHECK-GI-DOT-NEXT:    mov v3.s[1], v17.s[0]
-; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #16]
-; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #48]
-; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #80]
-; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #112]
-; CHECK-GI-DOT-NEXT:    mov v4.s[2], w2
-; CHECK-GI-DOT-NEXT:    mov v5.s[2], w6
-; CHECK-GI-DOT-NEXT:    mov v0.s[2], v6.s[0]
-; CHECK-GI-DOT-NEXT:    mov v1.s[2], v7.s[0]
-; CHECK-GI-DOT-NEXT:    mov v2.s[2], v16.s[0]
-; CHECK-GI-DOT-NEXT:    mov v3.s[2], v17.s[0]
-; CHECK-GI-DOT-NEXT:    ldr s6, [sp, #24]
-; CHECK-GI-DOT-NEXT:    ldr s7, [sp, #56]
-; CHECK-GI-DOT-NEXT:    ldr s16, [sp, #88]
-; CHECK-GI-DOT-NEXT:    ldr s17, [sp, #120]
-; CHECK-GI-DOT-NEXT:    mov v4.s[3], w3
-; CHECK-GI-DOT-NEXT:    mov v5.s[3], w7
-; CHECK-GI-DOT-NEXT:    mov v0.s[3], v6.s[0]
-; CHECK-GI-DOT-NEXT:    mov v1.s[3], v7.s[0]
-; CHECK-GI-DOT-NEXT:    mov v2.s[3], v16.s[0]
-; CHECK-GI-DOT-NEXT:    mov v3.s[3], v17.s[0]
-; CHECK-GI-DOT-NEXT:    uzp1 v4.8h, v4.8h, v5.8h
-; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-DOT-NEXT:    uzp1 v1.8h, v2.8h, v3.8h
+; CHECK-GI-DOT-NEXT:    fmov s0, w0
+; CHECK-GI-DOT-NEXT:    fmov s1, w1
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp]
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #64]
+; CHECK-GI-DOT-NEXT:    ldr w10, [sp, #72]
+; CHECK-GI-DOT-NEXT:    movi v4.8b, #1
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8]
+; CHECK-GI-DOT-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w2
+; CHECK-GI-DOT-NEXT:    fmov s3, w10
+; CHECK-GI-DOT-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w3
+; CHECK-GI-DOT-NEXT:    mov v0.b[3], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w4
+; CHECK-GI-DOT-NEXT:    mov v0.b[4], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w5
+; CHECK-GI-DOT-NEXT:    mov v0.b[5], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w6
+; CHECK-GI-DOT-NEXT:    mov v0.b[6], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w7
+; CHECK-GI-DOT-NEXT:    mov v0.b[7], v1.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s1, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #80]
+; CHECK-GI-DOT-NEXT:    mov v1.b[1], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #88]
+; CHECK-GI-DOT-NEXT:    mov v0.b[8], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-DOT-NEXT:    mov v1.b[2], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #96]
+; CHECK-GI-DOT-NEXT:    mov v0.b[9], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-DOT-NEXT:    mov v1.b[3], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #104]
+; CHECK-GI-DOT-NEXT:    mov v0.b[10], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32]
+; CHECK-GI-DOT-NEXT:    mov v1.b[4], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #112]
+; CHECK-GI-DOT-NEXT:    mov v0.b[11], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40]
+; CHECK-GI-DOT-NEXT:    mov v1.b[5], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w9
+; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #120]
+; CHECK-GI-DOT-NEXT:    mov v0.b[12], v2.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s2, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #48]
+; CHECK-GI-DOT-NEXT:    fmov s5, w9
+; CHECK-GI-DOT-NEXT:    mov v1.b[6], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w8
+; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #56]
+; CHECK-GI-DOT-NEXT:    mov v0.b[13], v2.b[0]
 ; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
-; CHECK-GI-DOT-NEXT:    movi v3.8b, #1
-; CHECK-GI-DOT-NEXT:    uzp1 v0.16b, v4.16b, v0.16b
-; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT:    xtn v1.8b, v1.8h
-; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-GI-DOT-NEXT:    sdot v5.4s, v0.16b, v3.16b
-; CHECK-GI-DOT-NEXT:    sdot v4.4s, v1.16b, v2.16b
-; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s
+; CHECK-GI-DOT-NEXT:    mov v1.b[7], v5.b[0]
+; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    mov v0.b[14], v3.b[0]
+; CHECK-GI-DOT-NEXT:    fmov s3, w8
+; CHECK-GI-DOT-NEXT:    mov v4.d[1], v2.d[0]
+; CHECK-GI-DOT-NEXT:    fmov d1, d1
+; CHECK-GI-DOT-NEXT:    mov v0.b[15], v3.b[0]
+; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT:    sdot v5.4s, v0.16b, v4.16b
+; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
+; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v3.4s
 ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
 ; CHECK-GI-DOT-NEXT:    fmov w0, s0
 ; CHECK-GI-DOT-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index 3c86f4bf9eb213..e536ba240453e2 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -127,19 +127,12 @@ entry:
 }
 
 define <2 x i8> @xtn_v2i128_v2i8(<2 x i128> %a) {
-; CHECK-SD-LABEL: xtn_v2i128_v2i8:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    mov v0.s[1], w2
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: xtn_v2i128_v2i8:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    mov v0.d[1], x2
-; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: xtn_v2i128_v2i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    mov v0.s[1], w2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 entry:
   %arg1 = trunc <2 x i128> %a to <2 x i8>
   ret <2 x i8> %arg1
@@ -174,9 +167,11 @@ define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) {
 ;
 ; CHECK-GI-LABEL: xtn_v2i128_v2i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    mov v0.d[1], x2
-; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w2
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = trunc <2 x i128> %a to <2 x i16>
@@ -194,19 +189,12 @@ entry:
 }
 
 define <2 x i32> @xtn_v2i128_v2i32(<2 x i128> %a) {
-; CHECK-SD-LABEL: xtn_v2i128_v2i32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    mov v0.s[1], w2
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: xtn_v2i128_v2i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    mov v0.d[1], x2
-; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: xtn_v2i128_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    mov v0.s[1], w2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 entry:
   %arg1 = trunc <2 x i128> %a to <2 x i32>
   ret <2 x i32> %arg1
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 716d2398996be2..bb968c8eb00fcb 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -242,16 +242,15 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i8_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    mov w8, #255 // =0xff
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    and w8, w2, #0xff
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    mov v2.16b, v1.16b
-; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i8> %a to <3 x i16>
@@ -271,14 +270,12 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i8_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #255 // =0xff
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    mov v1.s[1], w8
-; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    and w9, w1, #0xff
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    and w8, w2, #0xff
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i8> %a to <3 x i32>
@@ -305,16 +302,15 @@ define <3 x i64> @zext_v3i8_v3i64(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i8_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    and x8, x2, #0xff
-; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    and x8, x0, #0xff
+; CHECK-GI-NEXT:    and x9, x1, #0xff
+; CHECK-GI-NEXT:    and x10, x2, #0xff
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    fmov d2, x10
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i8> %a to <3 x i64>
@@ -407,16 +403,15 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i10_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    mov w8, #1023 // =0x3ff
+; CHECK-GI-NEXT:    and w8, w0, #0x3ff
+; CHECK-GI-NEXT:    and w9, w1, #0x3ff
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    and w8, w2, #0x3ff
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    mov v2.16b, v1.16b
-; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
-; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i10> %a to <3 x i16>
@@ -436,14 +431,12 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i10_v3i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #1023 // =0x3ff
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    mov v1.s[1], w8
-; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    and w8, w0, #0x3ff
+; CHECK-GI-NEXT:    and w9, w1, #0x3ff
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    and w8, w2, #0x3ff
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i10> %a to <3 x i32>
@@ -469,17 +462,15 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i10_v3i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    adrp x8, .LCPI27_0
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 def $x2
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI27_0]
-; CHECK-GI-NEXT:    and x8, x2, #0x3ff
-; CHECK-GI-NEXT:    fmov d2, x8
-; CHECK-GI-NEXT:    mov v0.s[1], w1
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-GI-NEXT:    mov d1, v0.d[1]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    and x8, x0, #0x3ff
+; CHECK-GI-NEXT:    and x9, x1, #0x3ff
+; CHECK-GI-NEXT:    and x10, x2, #0x3ff
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    fmov d2, x10
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i10> %a to <3 x i64>
@@ -1098,33 +1089,51 @@ define <16 x i32> @zext_v16i10_v16i32(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v16i10_v16i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s4, w0
-; CHECK-GI-NEXT:    fmov s5, w4
-; CHECK-GI-NEXT:    ldr s2, [sp]
-; CHECK-GI-NEXT:    ldr s0, [sp, #8]
-; CHECK-GI-NEXT:    ldr s3, [sp, #32]
-; CHECK-GI-NEXT:    ldr s1, [sp, #40]
-; CHECK-GI-NEXT:    movi v6.4s, #3, msl #8
-; CHECK-GI-NEXT:    mov v4.s[1], w1
-; CHECK-GI-NEXT:    mov v5.s[1], w5
-; CHECK-GI-NEXT:    mov v2.s[1], v0.s[0]
-; CHECK-GI-NEXT:    mov v3.s[1], v1.s[0]
-; CHECK-GI-NEXT:    ldr s0, [sp, #16]
-; CHECK-GI-NEXT:    ldr s1, [sp, #48]
-; CHECK-GI-NEXT:    mov v4.s[2], w2
-; CHECK-GI-NEXT:    mov v5.s[2], w6
-; CHECK-GI-NEXT:    mov v2.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v3.s[2], v1.s[0]
-; CHECK-GI-NEXT:    ldr s0, [sp, #24]
-; CHECK-GI-NEXT:    ldr s1, [sp, #56]
-; CHECK-GI-NEXT:    mov v4.s[3], w3
-; CHECK-GI-NEXT:    mov v5.s[3], w7
-; CHECK-GI-NEXT:    mov v2.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v3.s[3], v1.s[0]
-; CHECK-GI-NEXT:    and v0.16b, v4.16b, v6.16b
-; CHECK-GI-NEXT:    and v1.16b, v5.16b, v6.16b
-; CHECK-GI-NEXT:    and v2.16b, v2.16b, v6.16b
-; CHECK-GI-NEXT:    and v3.16b, v3.16b, v6.16b
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    ldr w8, [sp]
+; CHECK-GI-NEXT:    fmov s3, w5
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
+; CHECK-GI-NEXT:    ldr w10, [sp, #32]
+; CHECK-GI-NEXT:    ldr w11, [sp, #40]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w4
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    fmov s5, w10
+; CHECK-GI-NEXT:    fmov s6, w11
+; CHECK-GI-NEXT:    ldr w9, [sp, #48]
+; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w2
+; CHECK-GI-NEXT:    mov v2.h[1], v4.h[0]
+; CHECK-GI-NEXT:    mov v5.h[1], v6.h[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    fmov s6, w9
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    ldr w9, [sp, #56]
+; CHECK-GI-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w6
+; CHECK-GI-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w8
+; CHECK-GI-NEXT:    mov v5.h[2], v6.h[0]
+; CHECK-GI-NEXT:    fmov s6, w9
+; CHECK-GI-NEXT:    mov v1.h[2], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w3
+; CHECK-GI-NEXT:    mov v2.h[3], v4.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w7
+; CHECK-GI-NEXT:    mov v5.h[3], v6.h[0]
+; CHECK-GI-NEXT:    mov v1.h[3], v3.h[0]
+; CHECK-GI-NEXT:    movi v3.4s, #3, msl #8
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v5.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    and v3.16b, v4.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i10> %a to <16 x i32>
@@ -1176,44 +1185,64 @@ define <16 x i64> @zext_v16i10_v16i64(<16 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v16i10_v16i64:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    fmov s16, w0
-; CHECK-GI-NEXT:    fmov s17, w2
-; CHECK-GI-NEXT:    ldr s0, [sp]
-; CHECK-GI-NEXT:    fmov s18, w4
-; CHECK-GI-NEXT:    fmov s19, w6
-; CHECK-GI-NEXT:    ldr s1, [sp, #8]
-; CHECK-GI-NEXT:    ldr s2, [sp, #16]
-; CHECK-GI-NEXT:    ldr s3, [sp, #24]
-; CHECK-GI-NEXT:    ldr s4, [sp, #32]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
-; CHECK-GI-NEXT:    ldr s6, [sp, #48]
-; CHECK-GI-NEXT:    ldr s7, [sp, #56]
-; CHECK-GI-NEXT:    mov v16.s[1], w1
-; CHECK-GI-NEXT:    mov v17.s[1], w3
-; CHECK-GI-NEXT:    mov v18.s[1], w5
-; CHECK-GI-NEXT:    mov v19.s[1], w7
-; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v2.s[1], v3.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
-; CHECK-GI-NEXT:    mov v6.s[1], v7.s[0]
+; CHECK-GI-NEXT:    fmov s0, w0
+; CHECK-GI-NEXT:    fmov s1, w1
+; CHECK-GI-NEXT:    ldr w8, [sp]
+; CHECK-GI-NEXT:    fmov s2, w5
+; CHECK-GI-NEXT:    ldr w9, [sp, #8]
+; CHECK-GI-NEXT:    ldr w10, [sp, #32]
+; CHECK-GI-NEXT:    ldr w11, [sp, #40]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w4
+; CHECK-GI-NEXT:    fmov s3, w9
+; CHECK-GI-NEXT:    fmov s4, w11
+; CHECK-GI-NEXT:    ldr w9, [sp, #48]
+; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT:    fmov s2, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #16]
+; CHECK-GI-NEXT:    fmov s5, w8
+; CHECK-GI-NEXT:    ldr w8, [sp, #24]
+; CHECK-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NEXT:    fmov s3, w10
+; CHECK-GI-NEXT:    mov v3.h[1], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w2
+; CHECK-GI-NEXT:    mov v2.h[2], v5.h[0]
+; CHECK-GI-NEXT:    fmov s5, w8
 ; CHECK-GI-NEXT:    adrp x8, .LCPI54_0
-; CHECK-GI-NEXT:    ushll v1.2d, v16.2s, #0
-; CHECK-GI-NEXT:    ushll v3.2d, v17.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v18.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v19.2s, #0
-; CHECK-GI-NEXT:    ushll v16.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v18.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v19.2d, v4.2s, #0
-; CHECK-GI-NEXT:    ushll v20.2d, v6.2s, #0
-; CHECK-GI-NEXT:    ldr q17, [x8, :lo12:.LCPI54_0]
-; CHECK-GI-NEXT:    and v0.16b, v1.16b, v17.16b
-; CHECK-GI-NEXT:    and v1.16b, v3.16b, v17.16b
-; CHECK-GI-NEXT:    and v2.16b, v5.16b, v17.16b
-; CHECK-GI-NEXT:    and v3.16b, v7.16b, v17.16b
-; CHECK-GI-NEXT:    and v4.16b, v16.16b, v17.16b
-; CHECK-GI-NEXT:    and v5.16b, v18.16b, v17.16b
-; CHECK-GI-NEXT:    and v6.16b, v19.16b, v17.16b
-; CHECK-GI-NEXT:    and v7.16b, v20.16b, v17.16b
+; CHECK-GI-NEXT:    ldr q7, [x8, :lo12:.LCPI54_0]
+; CHECK-GI-NEXT:    mov v0.h[2], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w6
+; CHECK-GI-NEXT:    mov v2.h[3], v5.h[0]
+; CHECK-GI-NEXT:    mov v1.h[2], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ldr w9, [sp, #56]
+; CHECK-GI-NEXT:    mov v3.h[2], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w3
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w7
+; CHECK-GI-NEXT:    ushll v17.2d, v2.2s, #0
+; CHECK-GI-NEXT:    ushll2 v18.2d, v2.4s, #0
+; CHECK-GI-NEXT:    mov v1.h[3], v4.h[0]
+; CHECK-GI-NEXT:    fmov s4, w9
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    mov v3.h[3], v4.h[0]
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v5.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v6.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll2 v16.2d, v1.4s, #0
+; CHECK-GI-NEXT:    and v0.16b, v4.16b, v7.16b
+; CHECK-GI-NEXT:    and v1.16b, v5.16b, v7.16b
+; CHECK-GI-NEXT:    and v4.16b, v17.16b, v7.16b
+; CHECK-GI-NEXT:    and v5.16b, v18.16b, v7.16b
+; CHECK-GI-NEXT:    ushll v19.2d, v3.2s, #0
+; CHECK-GI-NEXT:    ushll2 v20.2d, v3.4s, #0
+; CHECK-GI-NEXT:    and v2.16b, v6.16b, v7.16b
+; CHECK-GI-NEXT:    and v3.16b, v16.16b, v7.16b
+; CHECK-GI-NEXT:    and v6.16b, v19.16b, v7.16b
+; CHECK-GI-NEXT:    and v7.16b, v20.16b, v7.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <16 x i10> %a to <16 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
index 6a291510fe66c1..90ee4b266f41dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
@@ -374,22 +374,22 @@ body:             |
     ; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
-    ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
     ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
     ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
-    ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
+    ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6-NEXT: %extend:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32)
     ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
     ;
     ; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
-    ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
     ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
     ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
-    ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-NEXT: %extend:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C1]](s32)
     ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
     %zero:_(s16) = G_CONSTANT i16 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
index 6ceb41199af6da..29b66288b3e4b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
@@ -246,22 +246,20 @@ body:             |
     ; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6-NEXT: {{  $}}
-    ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
-    ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
     ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
     ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
-    ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
+    ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX6-NEXT: %extend:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
     ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
     ;
     ; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
-    ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
     ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
     ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
-    ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; GFX9-NEXT: %extend:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
     ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
     %zero:_(s16) = G_CONSTANT i16 0

>From 56938a159bfc8a25f35c06bf854f6a3b8b01d47e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= <schuett at gmail.com>
Date: Fri, 16 Aug 2024 21:25:42 +0200
Subject: [PATCH 2/2] multi use

---
 .../AArch64/GlobalISel/combine-cast.mir       | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
index 9eef79a9c4bbee..026b18139c2a48 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-cast.mir
@@ -221,3 +221,22 @@ body:             |
     %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
     %large:_(<2 x s64>) = G_SEXT %bv(<2 x s32>)
     $q0 = COPY %large(<2 x s64>)
+...
+---
+name:            test_combine_anyext_build_vector_multi_use
+legalized: true
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_anyext_build_vector_multi_use
+    ; CHECK: %arg1:_(s32) = COPY $w0
+    ; CHECK-NEXT: %arg2:_(s32) = COPY $w0
+    ; CHECK-NEXT: %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+    ; CHECK-NEXT: %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>)
+    ; CHECK-NEXT: $q0 = COPY %large(<2 x s64>)
+    ; CHECK-NEXT: $d0 = COPY %bv(<2 x s32>)
+    %arg1:_(s32) = COPY $w0
+    %arg2:_(s32) = COPY $w0
+    %bv:_(<2 x s32>) = G_BUILD_VECTOR %arg1(s32), %arg2(s32)
+    %large:_(<2 x s64>) = G_ANYEXT %bv(<2 x s32>)
+    $q0 = COPY %large(<2 x s64>)
+    $d0 = COPY %bv(<2 x s32>)



More information about the llvm-commits mailing list