[llvm] [GlobalISel] Turn shuffle a, b, mask -> shuffle undef, b, mask iff mask does not reference a (PR #115377)

Thu Nov 14 14:02:17 PST 2024

https://github.com/konstantinschwarz updated https://github.com/llvm/llvm-project/pull/115377

>From 1776e3bc99207190e9267dee845966b64d694620 Mon Sep 17 00:00:00 2001
From: Konstantin Schwarz <konstantin.schwarz at amd.com>
Date: Tue, 5 Nov 2024 21:26:26 +0000
Subject: [PATCH] [GlobalISel] Turn shuffle a, b, mask -> shuffle undef, b,
 mask iff mask does not reference a

---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   4 +
 .../include/llvm/Target/GlobalISel/Combine.td |  10 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  66 +++++++++++
 .../GlobalISel/combine-extract-vec-elt.mir    |   4 +-
 .../AArch64/GlobalISel/combine-freeze.mir     |   8 +-
 ...rcombiner-shuffle-vector-disjoint-mask.mir | 101 ++++++++++++++++
 .../AArch64/aarch64-matrix-umull-smull.ll     |  12 +-
 llvm/test/CodeGen/AArch64/aarch64-smull.ll    |   3 +-
 llvm/test/CodeGen/AArch64/arm64-ext.ll        |  13 +--
 .../AArch64/arm64-extract_subvector.ll        |  64 ++++++----
 llvm/test/CodeGen/AArch64/ext-narrow-index.ll | 110 ++++++------------
 11 files changed, 273 insertions(+), 122 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 49e9d6bd73a4cc..de03fcc365785e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -874,6 +874,10 @@ class CombinerHelper {
   /// Remove references to rhs if it is undef
   bool matchShuffleUndefRHS(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  /// Turn shuffle a, b, mask -> shuffle undef, b, mask iff mask does not
+  /// reference a.
+  bool matchShuffleDisjointMask(MachineInstr &MI, BuildFnTy &MatchInfo);
+
   /// Use a function which takes in a MachineIRBuilder to perform a combine.
   /// By default, it erases the instruction def'd on \p MO from the function.
   void applyBuildFnMO(const MachineOperand &MO, BuildFnTy &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 95f3d637da8548..8fe7f0f5907c63 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1594,6 +1594,13 @@ def combine_shuffle_undef_rhs : GICombineRule<
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
 >;
 
+def combine_shuffle_disjoint_mask : GICombineRule<
+  (defs root:$root, build_fn_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+        [{ return Helper.matchShuffleDisjointMask(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
+>;
+
 // match_extract_of_element and insert_vector_elt_oob must be the first!
 def vector_ops_combines: GICombineGroup<[
 match_extract_of_element_undef_vector,
@@ -1945,7 +1952,8 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
 def prefer_sign_combines : GICombineGroup<[nneg_zext]>;
 
 def shuffle_combines : GICombineGroup<[combine_shuffle_concat,
-                                       combine_shuffle_undef_rhs]>;
+                                       combine_shuffle_undef_rhs,
+                                       combine_shuffle_disjoint_mask]>;
 
 def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
     vector_ops_combines, freeze_combines, cast_combines,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 1a1a1c28ef1500..83d78c0bde399e 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7724,3 +7724,69 @@ bool CombinerHelper::matchShuffleUndefRHS(MachineInstr &MI,
 
   return true;
 }
+
+static void commuteMask(MutableArrayRef<int> Mask, const unsigned NumElems) {
+  const unsigned MaskSize = Mask.size();
+  for (unsigned I = 0; I < MaskSize; ++I) {
+    int Idx = Mask[I];
+    if (Idx < 0)
+      continue;
+
+    if (Idx < (int)NumElems)
+      Mask[I] = Idx + NumElems;
+    else
+      Mask[I] = Idx - NumElems;
+  }
+}
+
+bool CombinerHelper::matchShuffleDisjointMask(MachineInstr &MI,
+                                              BuildFnTy &MatchInfo) {
+
+  auto &Shuffle = cast<GShuffleVector>(MI);
+  // If any of the two inputs is already undef, don't check the mask again to
+  // prevent infinite loop
+  if (getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Shuffle.getSrc1Reg(), MRI))
+    return false;
+
+  if (getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Shuffle.getSrc2Reg(), MRI))
+    return false;
+
+  const LLT DstTy = MRI.getType(Shuffle.getReg(0));
+  const LLT Src1Ty = MRI.getType(Shuffle.getSrc1Reg());
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_SHUFFLE_VECTOR, {DstTy, Src1Ty}}))
+    return false;
+
+  ArrayRef<int> Mask = Shuffle.getMask();
+  const unsigned NumSrcElems = Src1Ty.isVector() ? Src1Ty.getNumElements() : 1;
+
+  bool TouchesSrc1 = false;
+  bool TouchesSrc2 = false;
+  const unsigned NumElems = Mask.size();
+  for (unsigned Idx = 0; Idx < NumElems; ++Idx) {
+    if (Mask[Idx] < 0)
+      continue;
+
+    if (Mask[Idx] < (int)NumSrcElems)
+      TouchesSrc1 = true;
+    else
+      TouchesSrc2 = true;
+  }
+
+  if (TouchesSrc1 == TouchesSrc2)
+    return false;
+
+  Register NewSrc1 = Shuffle.getSrc1Reg();
+  SmallVector<int, 16> NewMask(Mask);
+  if (TouchesSrc2) {
+    NewSrc1 = Shuffle.getSrc2Reg();
+    commuteMask(NewMask, NumSrcElems);
+  }
+
+  MatchInfo = [=, &Shuffle](MachineIRBuilder &B) {
+    auto Undef = B.buildUndef(Src1Ty);
+    B.buildShuffleVector(Shuffle.getReg(0), NewSrc1, Undef, NewMask);
+  };
+
+  return true;
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
index 83b5c388520eb3..e2933690c7c55a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-extract-vec-elt.mir
@@ -598,9 +598,9 @@ body:             |
     ; CHECK: liveins: $x0, $x1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %arg1:_(<4 x s32>) = COPY $q0
-    ; CHECK-NEXT: %arg2:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: %idx:_(s64) = COPY $x1
-    ; CHECK-NEXT: %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2, shufflemask(undef, 0, 0, 0)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), [[DEF]], shufflemask(undef, 0, 0, 0)
     ; CHECK-NEXT: %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %idx(s64)
     ; CHECK-NEXT: $w0 = COPY %extract(s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir
index fa725ad0c5fb41..6b84a8488e478d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-freeze.mir
@@ -1395,9 +1395,9 @@ body:             |
     ; CHECK: liveins: $x0, $x1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %arg1:_(<4 x s32>) = COPY $q0
-    ; CHECK-NEXT: %arg2:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: %idx:_(s64) = G_CONSTANT i64 0
-    ; CHECK-NEXT: %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2, shufflemask(3, 0, 0, 0)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), [[DEF]], shufflemask(3, 0, 0, 0)
     ; CHECK-NEXT: %freeze_sv:_(<4 x s32>) = G_FREEZE %sv
     ; CHECK-NEXT: %extract:_(s32) = G_EXTRACT_VECTOR_ELT %freeze_sv(<4 x s32>), %idx(s64)
     ; CHECK-NEXT: $w0 = COPY %extract(s32)
@@ -1422,9 +1422,9 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %elt:_(s32) = COPY $w0
     ; CHECK-NEXT: %arg1:_(<4 x s32>) = COPY $q0
-    ; CHECK-NEXT: %arg2:_(<4 x s32>) = COPY $q1
     ; CHECK-NEXT: %idx:_(s64) = G_CONSTANT i64 0
-    ; CHECK-NEXT: %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2, shufflemask(3, 0, 0, 0)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), [[DEF]], shufflemask(3, 0, 0, 0)
     ; CHECK-NEXT: %freeze_sv:_(<4 x s32>) = G_FREEZE %sv
     ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE %elt
     ; CHECK-NEXT: %extract:_(<4 x s32>) = G_INSERT_VECTOR_ELT %freeze_sv, [[FREEZE]](s32), %idx(s64)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
new file mode 100644
index 00000000000000..9261d7af41c692
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
@@ -0,0 +1,101 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64-apple-ios  -run-pass=aarch64-prelegalizer-combiner %s -o - | FileCheck %s
+
+---
+name: shuffle_vector_unused_lhs
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: shuffle_vector_unused_lhs
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d1
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[DEF]], shufflemask(1, 0, 1, 0)
+    ; CHECK-NEXT: RET_ReallyLR implicit [[SHUF]](<4 x s32>)
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = COPY $d1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1(<2 x s32>), shufflemask(3, 2, 3, 2)
+    RET_ReallyLR implicit %2
+...
+
+---
+name: shuffle_vector_unused_rhs
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: shuffle_vector_unused_rhs
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[DEF]], shufflemask(0, 0, 1, 1)
+    ; CHECK-NEXT: RET_ReallyLR implicit [[SHUF]](<4 x s32>)
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = COPY $d1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0,0,1,1)
+    RET_ReallyLR implicit %2
+...
+
+---
+name: shuffle_vector_both_used
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: shuffle_vector_both_used
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[COPY1]], shufflemask(0, 2, 1, 3)
+    ; CHECK-NEXT: RET_ReallyLR implicit [[SHUF]](<4 x s32>)
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = COPY $d1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0,2,1,3)
+    RET_ReallyLR implicit %2
+...
+
+---
+name: shuffle_vector_undef_elems
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $d0, $d1
+
+    ; CHECK-LABEL: name: shuffle_vector_undef_elems
+    ; CHECK: liveins: $d0, $d1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[DEF]], shufflemask(undef, 0, 1, undef)
+    ; CHECK-NEXT: RET_ReallyLR implicit [[SHUF]](<4 x s32>)
+    %0:_(<2 x s32>) = COPY $d0
+    %1:_(<2 x s32>) = COPY $d1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(-1,0,1,-1)
+    RET_ReallyLR implicit %2
+...
+
+---
+name: shuffle_vector_scalar
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: shuffle_vector_scalar
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY]](s64), [[COPY]](s64), [[COPY]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x s64>)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(<4 x s64>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 0, 0, 0)
+    RET_ReallyLR implicit %2
+...
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 2a2f304b23e9b4..0c7a61739695fb 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -201,21 +201,21 @@ define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture r
 ; CHECK-GI:       // %bb.0: // %vector.header
 ; CHECK-GI-NEXT:    and w9, w3, #0xffff
 ; CHECK-GI-NEXT:    adrp x8, .LCPI2_0
-; CHECK-GI-NEXT:    dup v1.4s, w9
+; CHECK-GI-NEXT:    dup v0.4s, w9
 ; CHECK-GI-NEXT:    mov w9, w0
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI2_0]
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
 ; CHECK-GI-NEXT:    and x8, x9, #0xfffffff8
 ; CHECK-GI-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-GI-NEXT:    ldrh w9, [x2], #16
 ; CHECK-GI-NEXT:    subs x8, x8, #8
-; CHECK-GI-NEXT:    mov v0.s[0], w9
+; CHECK-GI-NEXT:    mov v2.s[0], w9
 ; CHECK-GI-NEXT:    mov w9, w0
 ; CHECK-GI-NEXT:    add w0, w0, #8
 ; CHECK-GI-NEXT:    lsl x9, x9, #2
-; CHECK-GI-NEXT:    tbl v3.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-GI-NEXT:    mul v3.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT:    str q3, [x1, x9]
+; CHECK-GI-NEXT:    tbl v2.16b, { v2.16b, v3.16b }, v1.16b
+; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    str q2, [x1, x9]
 ; CHECK-GI-NEXT:    b.ne .LBB2_1
 ; CHECK-GI-NEXT:  // %bb.2: // %for.end12
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 3c4901ade972ec..69a69dbd3b18ba 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -2385,9 +2385,8 @@ define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) {
 ;
 ; CHECK-GI-LABEL: do_stuff:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
 ; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
-; CHECK-GI-NEXT:    ext v2.16b, v1.16b, v2.16b, #8
+; CHECK-GI-NEXT:    mov d2, v1.d[1]
 ; CHECK-GI-NEXT:    umull v0.2d, v2.2s, v0.2s
 ; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
diff --git a/llvm/test/CodeGen/AArch64/arm64-ext.ll b/llvm/test/CodeGen/AArch64/arm64-ext.ll
index 932b94a91095a8..e32d83327fe424 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ext.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ext.ll
@@ -70,15 +70,10 @@ define <8 x i8> @test_vextd_undef(<8 x i8> %tmp1, <8 x i8> %tmp2) {
 }
 
 define <8 x i8> @test_vextd_undef2(<8 x i8> %tmp1, <8 x i8> %tmp2) {
-; CHECK-SD-LABEL: test_vextd_undef2:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    ext v0.8b, v0.8b, v0.8b, #6
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vextd_undef2:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ext v0.8b, v1.8b, v0.8b, #6
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vextd_undef2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #6
+; CHECK-NEXT:    ret
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 5>
   ret <8 x i8> %tmp3
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll b/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll
index 02e6f28ee6ff9b..f08ff2ccf693d9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll
@@ -1,35 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -mtriple=arm64-eabi -global-isel=1 -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -global-isel=1 -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-GI
 
 ; Extract of an upper half of a vector is an "ext.16b v0, v0, v0, #8" insn.
 
 define <8 x i8> @v8i8(<16 x i8> %a) nounwind {
-; CHECK-LABEL: v8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext.16b v0, v0, v0, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v8i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ext.16b v0, v0, v0, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v8i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov d0, v0[1]
+; CHECK-GI-NEXT:    ret
   %ret = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>  <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i8> %ret
 }
 
 define <4 x i16> @v4i16(<8 x i16> %a) nounwind {
-; CHECK-LABEL: v4i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext.16b v0, v0, v0, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v4i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ext.16b v0, v0, v0, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v4i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov d0, v0[1]
+; CHECK-GI-NEXT:    ret
   %ret = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>  <i32 4, i32 5, i32 6, i32 7>
   ret <4 x i16> %ret
 }
 
 define <2 x i32> @v2i32(<4 x i32> %a) nounwind {
-; CHECK-LABEL: v2i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext.16b v0, v0, v0, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ext.16b v0, v0, v0, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov d0, v0[1]
+; CHECK-GI-NEXT:    ret
   %ret = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>  <i32 2, i32 3>
   ret <2 x i32> %ret
 }
@@ -65,11 +80,16 @@ define <1 x ptr> @v1p0(<2 x ptr> %a) nounwind {
 }
 
 define <2 x float> @v2f32(<4 x float> %a) nounwind {
-; CHECK-LABEL: v2f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext.16b v0, v0, v0, #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: v2f32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ext.16b v0, v0, v0, #8
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: v2f32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov d0, v0[1]
+; CHECK-GI-NEXT:    ret
   %ret = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32>  <i32 2, i32 3>
   ret <2 x float> %ret
 }
diff --git a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll
index 2c5d33da93c863..177f2cafcf833a 100644
--- a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll
+++ b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll
@@ -17,17 +17,11 @@ entry:
 }
 
 define <8 x i8> @i8_off1(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-SD-LABEL: i8_off1:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #1
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: i8_off1:
-; CHECK-GISEL:       // %bb.0: // %entry
-; CHECK-GISEL-NEXT:    ext v0.16b, v0.16b, v1.16b, #1
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: i8_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   ret <8 x i8> %shuffle
@@ -42,8 +36,7 @@ define <8 x i8> @i8_off8(<16 x i8> %arg1, <16 x i8> %arg2) {
 ;
 ; CHECK-GISEL-LABEL: i8_off8:
 ; CHECK-GISEL:       // %bb.0: // %entry
-; CHECK-GISEL-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GISEL-NEXT:    mov d0, v0.d[1]
 ; CHECK-GISEL-NEXT:    ret
 entry:
   %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -90,17 +83,11 @@ entry:
 }
 
 define <4 x i16> @i16_off1(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-SD-LABEL: i16_off1:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #2
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: i16_off1:
-; CHECK-GISEL:       // %bb.0: // %entry
-; CHECK-GISEL-NEXT:    ext v0.16b, v0.16b, v1.16b, #2
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: i16_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> %arg2, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   ret <4 x i16> %shuffle
@@ -140,17 +127,11 @@ entry:
 }
 
 define <2 x i32> @i32_off1(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-SD-LABEL: i32_off1:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: i32_off1:
-; CHECK-GISEL:       // %bb.0: // %entry
-; CHECK-GISEL-NEXT:    ext v0.16b, v0.16b, v1.16b, #4
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: i32_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> %arg2, <2 x i32> <i32 1, i32 2>
   ret <2 x i32> %shuffle
@@ -228,18 +209,11 @@ entry:
 }
 
 define <8 x i8> @i8_zero_off1(<16 x i8> %arg1) {
-; CHECK-SD-LABEL: i8_zero_off1:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #1
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: i8_zero_off1:
-; CHECK-GISEL:       // %bb.0: // %entry
-; CHECK-GISEL-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-GISEL-NEXT:    ext v0.16b, v0.16b, v1.16b, #1
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: i8_zero_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   ret <8 x i8> %shuffle
@@ -254,9 +228,7 @@ define <8 x i8> @i8_zero_off8(<16 x i8> %arg1) {
 ;
 ; CHECK-GISEL-LABEL: i8_zero_off8:
 ; CHECK-GISEL:       // %bb.0: // %entry
-; CHECK-GISEL-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-GISEL-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GISEL-NEXT:    mov d0, v0.d[1]
 ; CHECK-GISEL-NEXT:    ret
 entry:
   %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -283,8 +255,8 @@ define <8 x i8> @i8_zero_off22(<16 x i8> %arg1) {
 ;
 ; CHECK-GISEL-LABEL: i8_zero_off22:
 ; CHECK-GISEL:       // %bb.0: // %entry
-; CHECK-GISEL-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-GISEL-NEXT:    ext v0.16b, v1.16b, v0.16b, #6
+; CHECK-GISEL-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-GISEL-NEXT:    ext v0.16b, v0.16b, v0.16b, #6
 ; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GISEL-NEXT:    ret
 entry:
@@ -304,18 +276,11 @@ entry:
 }
 
 define <4 x i16> @i16_zero_off1(<8 x i16> %arg1) {
-; CHECK-SD-LABEL: i16_zero_off1:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #2
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: i16_zero_off1:
-; CHECK-GISEL:       // %bb.0: // %entry
-; CHECK-GISEL-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-GISEL-NEXT:    ext v0.16b, v0.16b, v1.16b, #2
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: i16_zero_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   ret <4 x i16> %shuffle
@@ -355,18 +320,11 @@ entry:
 }
 
 define <2 x i32> @i32_zero_off1(<4 x i32> %arg1) {
-; CHECK-SD-LABEL: i32_zero_off1:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: i32_zero_off1:
-; CHECK-GISEL:       // %bb.0: // %entry
-; CHECK-GISEL-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-GISEL-NEXT:    ext v0.16b, v0.16b, v1.16b, #4
-; CHECK-GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: i32_zero_off1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
   ret <2 x i32> %shuffle