[llvm] [AArch64][GlobalISel] Combine G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16 (PR #142731)

Mon Jun 9 12:54:22 PDT 2025

https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/142731

>From 414013035d0f05caf872c3a12847bed6fa851607 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 9 Jun 2025 20:54:07 +0100
Subject: [PATCH] [AArch64][GlobalISel] Combine G_UNMERGE(G_DUPLANE16) ->
 G_DUPLANE16

We will generate G_UNMERGE(G_DUPLANE16) due to the legalization of shuffle
vector splats with mismatching vector sizes. The G_DUPLANE intrinsics can
handle different vector sizes (128bit and 64bit output, for example), and we
can combine away the unmerge.
---
 llvm/lib/Target/AArch64/AArch64Combine.td     |  18 +-
 .../postlegalizer-combiner-unmergedup.mir     |  91 ++
 llvm/test/CodeGen/AArch64/arm64-dup.ll        |  14 +-
 .../test/CodeGen/AArch64/arm64-neon-2velem.ll | 784 +++++-------------
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll  |  42 +-
 llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll | 102 +--
 6 files changed, 382 insertions(+), 669 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 940d18a17b244..571e2692cbfff 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -172,6 +172,21 @@ def form_duplane : GICombineRule <
   (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
+// Clean up G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16
+class unmerge_duplane<Instruction Op> : GICombineRule <
+  (defs root:$root),
+  (match (Op $a, $src, $c),
+         (G_UNMERGE_VALUES $d1, $d2, $a):$root,
+         [{ return MRI.getType(${d1}.getReg()).getSizeInBits() == 64; }]),
+  (apply (GIReplaceReg $d2, $d1), (Op $d1, $src, $c))
+>;
+def unmerge_duplane8 : unmerge_duplane<G_DUPLANE8>;
+def unmerge_duplane16 : unmerge_duplane<G_DUPLANE16>;
+def unmerge_duplane32 : unmerge_duplane<G_DUPLANE32>;
+// G_DUPLANE64 is not included as the result in scalar.
+def unmerge_duplanes : GICombineGroup<[unmerge_duplane8, unmerge_duplane16,
+                                       unmerge_duplane32]>;
+
 def shuffle_vector_lowering : GICombineGroup<[dup, form_duplane, rev, ext, zip,
                                               uzp, trn, fullrev, shuf_to_ins]>;
 
@@ -325,7 +340,8 @@ def AArch64PostLegalizerLowering
                         lower_vector_fcmp, form_truncstore,
                         vector_sext_inreg_to_shift,
                         unmerge_ext_to_unmerge, lower_mulv2s64,
-                        vector_unmerge_lowering, insertelt_nonconst]> {
+                        vector_unmerge_lowering, insertelt_nonconst,
+                        unmerge_duplanes]> {
 }
 
 // Post-legalization combines which are primarily optimizations.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir
new file mode 100644
index 0000000000000..acfbec0dd0ef2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir
@@ -0,0 +1,91 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-lowering -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name:            unmerge_dup8
+legalized:       true
+body:             |
+  bb.1.entry:
+    ; CHECK-LABEL: name: unmerge_dup8
+    ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[DUPLANE8_:%[0-9]+]]:_(<8 x s8>) = G_DUPLANE8 [[COPY]], [[C]](s64)
+    ; CHECK-NEXT: $d0 = COPY [[DUPLANE8_]](<8 x s8>)
+    ; CHECK-NEXT: $d1 = COPY [[DUPLANE8_]](<8 x s8>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(<16 x s8>) = COPY $q0
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(<16 x s8>) = G_DUPLANE8 %0, %1
+    %3:_(<8 x s8>), %4:_(<8 x s8>) = G_UNMERGE_VALUES %2
+    $d0 = COPY %3
+    $d1 = COPY %4
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            unmerge_dup16
+legalized:       true
+body:             |
+  bb.1.entry:
+    ; CHECK-LABEL: name: unmerge_dup16
+    ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[DUPLANE16_:%[0-9]+]]:_(<4 x s16>) = G_DUPLANE16 [[COPY]], [[C]](s64)
+    ; CHECK-NEXT: $d0 = COPY [[DUPLANE16_]](<4 x s16>)
+    ; CHECK-NEXT: $d1 = COPY [[DUPLANE16_]](<4 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(<8 x s16>) = COPY $q0
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(<8 x s16>) = G_DUPLANE16 %0, %1
+    %3:_(<4 x s16>), %4:_(<4 x s16>) = G_UNMERGE_VALUES %2
+    $d0 = COPY %3
+    $d1 = COPY %4
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            unmerge_dup32
+legalized:       true
+body:             |
+  bb.1.entry:
+    ; CHECK-LABEL: name: unmerge_dup32
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[DUPLANE32_:%[0-9]+]]:_(<2 x s32>) = G_DUPLANE32 [[COPY]], [[C]](s64)
+    ; CHECK-NEXT: $d0 = COPY [[DUPLANE32_]](<2 x s32>)
+    ; CHECK-NEXT: $d1 = COPY [[DUPLANE32_]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(<4 x s32>) = G_DUPLANE32 %0, %1
+    %3:_(<2 x s32>), %4:_(<2 x s32>) = G_UNMERGE_VALUES %2
+    $d0 = COPY %3
+    $d1 = COPY %4
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            unmerge_dup64
+legalized:       true
+body:             |
+  bb.1.entry:
+    ; CHECK-LABEL: name: unmerge_dup64
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[DUPLANE64_:%[0-9]+]]:_(<2 x s64>) = G_DUPLANE64 [[COPY]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[DUPLANE64_]](<2 x s64>), [[C1]](s64)
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[DUPLANE64_]](<2 x s64>), [[C2]](s64)
+    ; CHECK-NEXT: $d0 = COPY [[EVEC]](s64)
+    ; CHECK-NEXT: $d1 = COPY [[EVEC1]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(s64) = G_CONSTANT i64 1
+    %2:_(<2 x s64>) = G_DUPLANE64 %0, %1
+    %3:_(s64), %4:_(s64) = G_UNMERGE_VALUES %2
+    $d0 = COPY %3
+    $d1 = COPY %4
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index c279cf0f241d2..49fb6c98e223f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -401,16 +401,10 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
 ; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
 ; the formation of an indexed-by-7 MLS.
 define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
-; CHECK-SD-LABEL: test_high_splat:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls.4h v0, v1, v2[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_high_splat:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup.8h v2, v2[7]
-; CHECK-GI-NEXT:    mls.4h v0, v2, v1
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_high_splat:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls.4h v0, v1, v2[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index c3ad3b4192cf9..85d8b7c3e2866 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -159,16 +159,10 @@ entry:
 }
 
 define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mla v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    mla v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmla_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -189,16 +183,10 @@ entry:
 }
 
 define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    mla v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmla_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -271,16 +259,10 @@ entry:
 }
 
 define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    mls v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmls_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -301,16 +283,10 @@ entry:
 }
 
 define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmls_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -427,16 +403,10 @@ entry:
 }
 
 define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -455,16 +425,10 @@ entry:
 }
 
 define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -483,16 +447,10 @@ entry:
 }
 
 define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -511,16 +469,10 @@ entry:
 }
 
 define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -567,16 +519,10 @@ entry:
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 
 define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vfma_laneq_f32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vfma_laneq_f32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    fmla v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vfma_laneq_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -834,16 +780,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -852,16 +792,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -920,8 +854,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -940,8 +873,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -978,16 +910,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -996,16 +922,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1064,8 +984,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1084,8 +1003,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1122,16 +1040,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1140,16 +1052,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1208,8 +1114,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1228,8 +1133,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1266,16 +1170,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1284,16 +1182,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1352,8 +1244,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1372,8 +1263,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1512,16 +1402,10 @@ entry:
 }
 
 define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smull v0.4s, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmull_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1529,16 +1413,10 @@ entry:
 }
 
 define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smull v0.2d, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    smull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmull_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1546,16 +1424,10 @@ entry:
 }
 
 define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umull v0.4s, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmull_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1563,16 +1435,10 @@ entry:
 }
 
 define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umull v0.2d, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmull_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1588,8 +1454,7 @@ define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-GI-LABEL: test_vmull_high_laneq_s16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    smull v0.4s, v0.4h, v1.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1607,8 +1472,7 @@ define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-GI-LABEL: test_vmull_high_laneq_s32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    smull v0.2d, v0.2s, v1.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1626,8 +1490,7 @@ define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-GI-LABEL: test_vmull_high_laneq_u16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1645,8 +1508,7 @@ define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-GI-LABEL: test_vmull_high_laneq_u32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1816,16 +1678,10 @@ entry:
 }
 
 define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vqdmull_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sqdmull v0.4s, v0.4h, v1.h[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqdmull_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[3]
-; CHECK-GI-NEXT:    sqdmull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqdmull_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1833,16 +1689,10 @@ entry:
 }
 
 define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vqdmull_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sqdmull v0.2d, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqdmull_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    sqdmull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqdmull_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1898,8 +1748,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    sqdmull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    sqdmull v0.4s, v0.4h, v1.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1917,8 +1766,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    sqdmull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    sqdmull v0.2d, v0.2s, v1.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -2322,16 +2170,10 @@ entry:
 }
 
 define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_f32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_f32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    fmul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %mul = fmul <2 x float> %shuffle, %a
@@ -2553,16 +2395,10 @@ entry:
 }
 
 define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vmulx_laneq_f32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmulx v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmulx_laneq_f32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    fmulx v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmulx_laneq_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmulx v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -2657,16 +2493,10 @@ entry:
 }
 
 define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mla v0.4h, v1.4h, v2.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    mla v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmla_laneq_s16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -2687,16 +2517,10 @@ entry:
 }
 
 define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mla v0.2s, v1.2s, v2.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    mla v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmla_laneq_s32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -2769,16 +2593,10 @@ entry:
 }
 
 define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    mls v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmls_laneq_s16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -2799,16 +2617,10 @@ entry:
 }
 
 define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmls_laneq_s32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -2925,16 +2737,10 @@ entry:
 }
 
 define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.4h, v0.4h, v1.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT:    mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_s16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -2953,16 +2759,10 @@ entry:
 }
 
 define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_s32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -2981,16 +2781,10 @@ entry:
 }
 
 define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.4h, v0.4h, v1.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT:    mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_u16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -3009,16 +2803,10 @@ entry:
 }
 
 define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_u32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -3061,16 +2849,10 @@ entry:
 }
 
 define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vfma_laneq_f32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmla v0.2s, v1.2s, v2.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vfma_laneq_f32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    fmla v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vfma_laneq_f32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -3188,16 +2970,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlal v0.4s, v1.4h, v2.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_s16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -3206,16 +2982,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlal v0.2d, v1.2s, v2.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_s32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -3274,8 +3044,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s16_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.h[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3294,8 +3063,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s32_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3332,16 +3100,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlsl v0.4s, v1.4h, v2.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_s16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -3350,16 +3112,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlsl v0.2d, v1.2s, v2.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_s32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -3418,8 +3174,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.h[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3438,8 +3193,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3476,16 +3230,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlal v0.4s, v1.4h, v2.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_u16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -3494,16 +3242,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlal v0.2d, v1.2s, v2.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_u32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -3562,8 +3304,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u16_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.h[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3582,8 +3323,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u32_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3620,16 +3360,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlsl v0.4s, v1.4h, v2.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_u16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -3638,16 +3372,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlsl v0.2d, v1.2s, v2.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_u32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -3706,8 +3434,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.h[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3726,8 +3453,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3866,16 +3592,10 @@ entry:
 }
 
 define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smull v0.4s, v0.4h, v1.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmull_laneq_s16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -3883,16 +3603,10 @@ entry:
 }
 
 define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smull v0.2d, v0.2s, v1.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    smull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmull_laneq_s32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -3900,16 +3614,10 @@ entry:
 }
 
 define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_u16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umull v0.4s, v0.4h, v1.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_u16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmull_laneq_u16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -3917,16 +3625,10 @@ entry:
 }
 
 define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_u32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umull v0.2d, v0.2s, v1.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_u32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmull_laneq_u32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -3942,8 +3644,7 @@ define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-GI-LABEL: test_vmull_high_laneq_s16_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    smull v0.4s, v0.4h, v1.h[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3961,8 +3662,7 @@ define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-GI-LABEL: test_vmull_high_laneq_s32_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    smull v0.2d, v0.2s, v1.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -3980,8 +3680,7 @@ define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-GI-LABEL: test_vmull_high_laneq_u16_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.h[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -3999,8 +3698,7 @@ define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-GI-LABEL: test_vmull_high_laneq_u32_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -4170,16 +3868,10 @@ entry:
 }
 
 define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vqdmull_laneq_s16_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sqdmull v0.4s, v0.4h, v1.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqdmull_laneq_s16_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT:    sqdmull v0.4s, v0.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqdmull_laneq_s16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -4187,16 +3879,10 @@ entry:
 }
 
 define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vqdmull_laneq_s32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sqdmull v0.2d, v0.2s, v1.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqdmull_laneq_s32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    sqdmull v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqdmull_laneq_s32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -4252,8 +3938,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s16_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT:    sqdmull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    sqdmull v0.4s, v0.4h, v1.h[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -4271,8 +3956,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s32_0:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d0, v0.d[1]
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    sqdmull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    sqdmull v0.2d, v0.2s, v1.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -4402,16 +4086,10 @@ entry:
 }
 
 define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_f32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_f32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    fmul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_f32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x float> %shuffle, %a
@@ -4498,16 +4176,10 @@ entry:
 }
 
 define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vmulx_laneq_f32_0:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmulx v0.2s, v0.2s, v1.s[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmulx_laneq_f32_0:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[0]
-; CHECK-GI-NEXT:    fmulx v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmulx_laneq_f32_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmulx v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index ddd8a72618b1e..367105f783817 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -807,46 +807,28 @@ define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
 }
 
 define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
-; CHECK-SD-LABEL: test_vdup_laneq_s8:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    dup v0.8b, v0.b[5]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vdup_laneq_s8:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    dup v0.16b, v0.b[5]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vdup_laneq_s8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.8b, v0.b[5]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
   ret <8 x i8> %shuffle
 }
 
 define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
-; CHECK-SD-LABEL: test_vdup_laneq_s16:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    dup v0.4h, v0.h[2]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vdup_laneq_s16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    dup v0.8h, v0.h[2]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vdup_laneq_s16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.4h, v0.h[2]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i16> %shuffle
 }
 
 define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
-; CHECK-SD-LABEL: test_vdup_laneq_s32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    dup v0.2s, v0.s[1]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vdup_laneq_s32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    dup v0.4s, v0.s[1]
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vdup_laneq_s32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v0.2s, v0.s[1]
+; CHECK-NEXT:    ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
   ret <2 x i32> %shuffle
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
index bb97ba6d92651..cb14adc00df00 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -569,16 +569,10 @@ define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
 ; Using sqrdmlah intrinsics
 
 define <4 x i16> @test_vqrdmlah_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vqrdmlah_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sqrdmlah v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqrdmlah_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    sqrdmlah v0.4h, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqrdmlah_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmlah v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vqrdmlah_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
@@ -586,16 +580,10 @@ entry:
 }
 
 define <2 x i32> @test_vqrdmlah_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vqrdmlah_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sqrdmlah v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqrdmlah_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    sqrdmlah v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqrdmlah_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmlah v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
   %vqrdmlah_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
@@ -700,22 +688,13 @@ entry:
 }
 
 define i16 @test_vqrdmlahh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
-; CHECK-SD-LABEL: test_vqrdmlahh_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmov s1, w0
-; CHECK-SD-NEXT:    fmov s2, w1
-; CHECK-SD-NEXT:    sqrdmlah v1.4h, v2.4h, v0.h[7]
-; CHECK-SD-NEXT:    umov w0, v1.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqrdmlahh_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    fmov s2, w1
-; CHECK-GI-NEXT:    sqrdmlah v1.4h, v2.4h, v0.4h
-; CHECK-GI-NEXT:    umov w0, v1.h[0]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqrdmlahh_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
+; CHECK-NEXT:    sqrdmlah v1.4h, v2.4h, v0.h[7]
+; CHECK-NEXT:    umov w0, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %0 = insertelement <4 x i16> undef, i16 %a, i64 0
   %1 = insertelement <4 x i16> undef, i16 %b, i64 0
@@ -740,16 +719,10 @@ entry:
 }
 
 define <4 x i16> @test_vqrdmlsh_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vqrdmlsh_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sqrdmlsh v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqrdmlsh_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    sqrdmlsh v0.4h, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqrdmlsh_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmlsh v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
@@ -757,16 +730,10 @@ entry:
 }
 
 define <2 x i32> @test_vqrdmlsh_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vqrdmlsh_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    sqrdmlsh v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqrdmlsh_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    sqrdmlsh v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqrdmlsh_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sqrdmlsh v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
   %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
@@ -871,22 +838,13 @@ entry:
 }
 
 define i16 @test_vqrdmlshh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
-; CHECK-SD-LABEL: test_vqrdmlshh_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmov s1, w0
-; CHECK-SD-NEXT:    fmov s2, w1
-; CHECK-SD-NEXT:    sqrdmlsh v1.4h, v2.4h, v0.h[7]
-; CHECK-SD-NEXT:    umov w0, v1.h[0]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vqrdmlshh_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v0.8h, v0.h[7]
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    fmov s2, w1
-; CHECK-GI-NEXT:    sqrdmlsh v1.4h, v2.4h, v0.4h
-; CHECK-GI-NEXT:    umov w0, v1.h[0]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vqrdmlshh_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov s1, w0
+; CHECK-NEXT:    fmov s2, w1
+; CHECK-NEXT:    sqrdmlsh v1.4h, v2.4h, v0.h[7]
+; CHECK-NEXT:    umov w0, v1.h[0]
+; CHECK-NEXT:    ret
 entry:
   %0 = insertelement <4 x i16> undef, i16 %a, i64 0
   %1 = insertelement <4 x i16> undef, i16 %b, i64 0